Merge aom/master and webm/nextgenv2
Change-Id: Ic9679c0551f809807bd5c1196c302b072e7fa613
diff --git a/CHANGELOG b/CHANGELOG
index f6c2e03..dad0ea1 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,9 @@
+Next Release
+ - Incompatible changes:
+ The AV1 encoder's default keyframe interval changed to 128 from 9999.
+
2016-04-07 v0.1.0 "AOMedia Codec 1"
This release is the first Alliance for Open Media codec.
-
2015-11-09 v1.5.0 "Javan Whistling Duck"
This release improves upon the VP9 encoder and speeds up the encoding and
decoding processes.
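
Note on the keyframe-interval change listed above: applications that relied on the old, effectively unlimited default can restore it explicitly through the standard encoder configuration. A minimal sketch, assuming the usual aom_codec_enc_cfg_t flow; the helper name and resolution parameters are illustrative, and error handling is reduced to the config call:

    #include "aom/aom_encoder.h"
    #include "aom/aomcx.h"

    /* Restore the previous keyframe interval (9999) instead of the new
     * default of 128. */
    static aom_codec_err_t init_with_old_kf_interval(aom_codec_ctx_t *ctx,
                                                     unsigned int width,
                                                     unsigned int height) {
      aom_codec_enc_cfg_t cfg;
      aom_codec_err_t res =
          aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, 0);
      if (res != AOM_CODEC_OK) return res;
      cfg.g_w = width;
      cfg.g_h = height;
      cfg.kf_max_dist = 9999; /* previous default keyframe interval */
      return aom_codec_enc_init(ctx, aom_codec_av1_cx(), &cfg, 0);
    }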
diff --git a/PATENTS b/PATENTS
index d57102a..be491f5 100644
--- a/PATENTS
+++ b/PATENTS
@@ -57,10 +57,10 @@
2. Definitions.
-2.1. Affiliate. “Affiliate” means an entity that directly or indirectly
+2.1. Affiliate. "Affiliate" means an entity that directly or indirectly
Controls, is Controlled by, or is under common Control of that party.
-2.2. Control. “Control” means direct or indirect control of more than 50% of
+2.2. Control. "Control" means direct or indirect control of more than 50% of
the voting power to elect directors of that corporation, or for any other
entity, the power to direct management of such entity.
@@ -70,7 +70,7 @@
2.4. Encoder. "Encoder" means any encoder that produces a bitstream that can
be decoded by a Decoder only to the extent it produces such a bitstream.
-2.5. Final Deliverable. “Final Deliverable” means the final version of a
+2.5. Final Deliverable. "Final Deliverable" means the final version of a
deliverable approved by the Alliance for Open Media as a Final
Deliverable.
@@ -79,9 +79,9 @@
Implementation also includes components of an Implementation only to the
extent they are used as part of an Implementation.
-2.7. License. “License” means this license.
+2.7. License. "License" means this license.
-2.8. Licensee. “Licensee” means any person or entity who exercises patent
+2.8. Licensee. "Licensee" means any person or entity who exercises patent
rights granted under this License.
2.9. Licensor. "Licensor" means (i) any Licensee that makes, sells, offers
@@ -98,11 +98,11 @@
as if the Specification was a W3C Recommendation; or (ii) are infringed
by the Reference Implementation.
-2.11. Reference Implementation. “Reference Implementation” means an Encoder
+2.11. Reference Implementation. "Reference Implementation" means an Encoder
and/or Decoder released by the Alliance for Open Media as a Final
Deliverable.
-2.12. Specification. “Specification” means the specification designated by
+2.12. Specification. "Specification" means the specification designated by
the Alliance for Open Media as a Final Deliverable for which this
License was issued.
diff --git a/README b/README
index d7aeeaa..3e7dfb8 100644
--- a/README
+++ b/README
@@ -1,6 +1,6 @@
-README - 22 March 2016
+README - 23 March 2015
-Welcome to the AOM Codec SDK!
+Welcome to the WebM VP8/AV1 Codec SDK!
COMPILING THE APPLICATIONS/LIBRARIES:
The build system used is similar to autotools. Building generally consists of
@@ -47,7 +47,6 @@
--help output of the configure script. As of this writing, the list of
available targets is:
- armv6-darwin-gcc
armv6-linux-rvct
armv6-linux-gcc
armv6-none-rvct
@@ -120,15 +119,30 @@
This defaults to config.log. This should give a good indication of what went
wrong. If not, contact us for support.
-AOM TEST VECTORS:
+VP8/AV1 TEST VECTORS:
The test vectors can be downloaded and verified using the build system after
running configure. To specify an alternate directory the
LIBAOM_TEST_DATA_PATH environment variable can be used.
$ ./configure --enable-unit-tests
- $ LIBAOM_TEST_DATA_PATH=../libaom-test-data make testdata
+ $ LIBAOM_TEST_DATA_PATH=../-test-data make testdata
+
+CODE STYLE:
+ The coding style used by this project is enforced with clang-format using the
+ configuration contained in the .clang-format file in the root of the
+ repository.
+
+ Before pushing changes for review you can format your code with:
+ # Apply clang-format to modified .c, .h and .cc files
+ $ clang-format -i --style=file \
+ $(git diff --name-only --diff-filter=ACMR '*.[hc]' '*.cc')
+
+ If your local formatting differs from what the review system expects, check
+ the .clang-format file for the clang-format version used to generate it.
+
+ See also: http://clang.llvm.org/docs/ClangFormat.html
SUPPORT
This library is an open source project supported by its community. Please
- email aomediacodec-chair@jointdevelopment.kavi.com for help.
+ email webm-discuss@webmproject.org for help.
diff --git a/aom/aom.h b/aom/aom.h
index a73b6ab..98366b8 100644
--- a/aom/aom.h
+++ b/aom/aom.h
@@ -43,9 +43,9 @@
* The set of macros define the control functions of AOM interface
*/
enum aom_com_control_id {
- AOM_SET_REFERENCE =
- 1, /**< pass in an external frame into decoder to be used as reference
- frame */
+ /*!\brief pass an external frame into the decoder to be used as a reference frame
+ */
+ AOM_SET_REFERENCE = 1,
AOM_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */
AOM_SET_POSTPROC = 3, /**< set the decoder's post processing settings */
AOM_SET_DBG_COLOR_REF_FRAME =
@@ -60,6 +60,9 @@
*/
AV1_GET_REFERENCE = 128, /**< get a pointer to a reference frame */
AOM_COMMON_CTRL_ID_MAX,
+
+ AV1_GET_NEW_FRAME_IMAGE = 192, /**< get a pointer to the new frame */
+
AOM_DECODER_CTRL_ID_START = 256
};
@@ -88,8 +91,9 @@
*/
typedef struct aom_postproc_cfg {
- int post_proc_flag; /**< the types of post processing to be done, should be
- combination of "aom_postproc_level" */
+ /*!\brief the types of post processing to be done; should be a combination of
+ * "aom_postproc_level" */
+ int post_proc_flag;
int deblocking_level; /**< the strength of deblocking, valid range [0, 16] */
int noise_level; /**< the strength of additive noise, valid range [0, 16] */
} aom_postproc_cfg_t;
@@ -143,6 +147,8 @@
#define AOM_CTRL_AOM_SET_DBG_DISPLAY_MV
AOM_CTRL_USE_TYPE(AV1_GET_REFERENCE, av1_ref_frame_t *)
#define AOM_CTRL_AV1_GET_REFERENCE
+AOM_CTRL_USE_TYPE(AV1_GET_NEW_FRAME_IMAGE, aom_image_t *)
+#define AOM_CTRL_AV1_GET_NEW_FRAME_IMAGE
/*!\endcond */
/*! @} - end defgroup aom */
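
The AV1_GET_NEW_FRAME_IMAGE control added above is declared to take an aom_image_t * (see the AOM_CTRL_USE_TYPE line in this hunk). A minimal decoder-side sketch, assuming a context that has just decoded a frame; the helper name is illustrative and error handling is reduced to a boolean:

    #include "aom/aom.h"

    /* Fetch the most recently decoded frame into a caller-provided image
     * descriptor via the new control. */
    static int get_new_frame(aom_codec_ctx_t *decoder, aom_image_t *img) {
      return aom_codec_control(decoder, AV1_GET_NEW_FRAME_IMAGE, img) ==
             AOM_CODEC_OK;
    }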
diff --git a/aom/aom_codec.h b/aom/aom_codec.h
index e1c48ec..1d301d1 100644
--- a/aom/aom_codec.h
+++ b/aom/aom_codec.h
@@ -61,8 +61,8 @@
#if defined(__GNUC__) && __GNUC__
#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
#elif defined(_MSC_VER)
+/*!\brief \copydoc #DEPRECATED */
#define DECLSPEC_DEPRECATED __declspec(deprecated)
-/**< \copydoc #DEPRECATED */
#else
#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */
#endif
@@ -76,6 +76,17 @@
#define UNUSED
#endif
+/*!\brief Decorator indicating that given struct/union/enum is packed */
+#ifndef ATTRIBUTE_PACKED
+#if defined(__GNUC__) && __GNUC__
+#define ATTRIBUTE_PACKED __attribute__((packed))
+#elif defined(_MSC_VER)
+#define ATTRIBUTE_PACKED
+#else
+#define ATTRIBUTE_PACKED
+#endif
+#endif /* ATTRIBUTE_PACKED */
+
/*!\brief Current ABI version number
*
* \internal
@@ -216,6 +227,18 @@
AOM_BITS_12 = 12, /**< 12 bits */
} aom_bit_depth_t;
+/*!\brief Superblock size selection.
+ *
+ * Defines the superblock size used for encoding. The superblock size can
+ * either be fixed at 64x64 or 128x128 pixels, or it can be dynamically
+ * selected by the encoder for each frame.
+ */
+typedef enum aom_superblock_size {
+ AOM_SUPERBLOCK_SIZE_64X64, /**< Always use 64x64 superblocks. */
+ AOM_SUPERBLOCK_SIZE_128X128, /**< Always use 128x128 superblocks. */
+ AOM_SUPERBLOCK_SIZE_DYNAMIC /**< Select superblock size dynamically. */
+} aom_superblock_size_t;
+
/*
* Library Version Number Interface
*
diff --git a/aom/aom_encoder.h b/aom/aom_encoder.h
index 4c07a50..09d251a 100644
--- a/aom/aom_encoder.h
+++ b/aom/aom_encoder.h
@@ -32,29 +32,6 @@
#include "./aom_codec.h"
-/*! Temporal Scalability: Maximum length of the sequence defining frame
- * layer membership
- */
-#define AOM_TS_MAX_PERIODICITY 16
-
-/*! Temporal Scalability: Maximum number of coding layers */
-#define AOM_TS_MAX_LAYERS 5
-
-/*!\deprecated Use #AOM_TS_MAX_PERIODICITY instead. */
-#define MAX_PERIODICITY AOM_TS_MAX_PERIODICITY
-
-/*! Temporal+Spatial Scalability: Maximum number of coding layers */
-#define AOM_MAX_LAYERS 12 // 3 temporal + 4 spatial layers are allowed.
-
-/*!\deprecated Use #AOM_MAX_LAYERS instead. */
-#define MAX_LAYERS AOM_MAX_LAYERS // 3 temporal + 4 spatial layers allowed.
-
-/*! Spatial Scalability: Maximum number of coding layers */
-#define AOM_SS_MAX_LAYERS 5
-
-/*! Spatial Scalability: Default number of coding layers */
-#define AOM_SS_DEFAULT_LAYERS 1
-
/*!\brief Current ABI version number
*
* \internal
@@ -96,8 +73,8 @@
* The available flags are specified by AOM_CODEC_USE_* defines.
*/
#define AOM_CODEC_USE_PSNR 0x10000 /**< Calculate PSNR on each frame */
+/*!\brief Make the encoder output one partition at a time. */
#define AOM_CODEC_USE_OUTPUT_PARTITION 0x20000
-/**< Make the encoder output one partition at a time. */
#define AOM_CODEC_USE_HIGHBITDEPTH 0x40000 /**< Use high bitdepth */
/*!\brief Generic fixed size buffer structure
@@ -125,13 +102,13 @@
*/
typedef uint32_t aom_codec_frame_flags_t;
#define AOM_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */
+/*!\brief frame can be dropped without affecting the stream (no future frame
+ * depends on this one) */
#define AOM_FRAME_IS_DROPPABLE 0x2
-/**< frame can be dropped without affecting the stream (no future frame depends
- * on this one) */
+/*!\brief frame should be decoded but will not be shown */
#define AOM_FRAME_IS_INVISIBLE 0x4
-/**< frame should be decoded but will not be shown */
+/*!\brief this is a fragment of the encoded frame */
#define AOM_FRAME_IS_FRAGMENT 0x8
-/**< this is a fragment of the encoded frame */
/*!\brief Error Resilient flags
*
@@ -140,13 +117,13 @@
* aom_codec_enc_cfg::g_error_resilient variable.
*/
typedef uint32_t aom_codec_er_flags_t;
+/*!\brief Improve resiliency against losses of whole frames */
#define AOM_ERROR_RESILIENT_DEFAULT 0x1
-/**< Improve resiliency against losses of whole frames */
-#define AOM_ERROR_RESILIENT_PARTITIONS 0x2
-/**< The frame partitions are independently decodable by the bool decoder,
+/*!\brief The frame partitions are independently decodable by the bool decoder,
* meaning that partitions can be decoded even though earlier partitions have
* been lost. Note that intra prediction is still done over the partition
* boundary. */
+#define AOM_ERROR_RESILIENT_PARTITIONS 0x2
/*!\brief Encoder output packet variants
*
@@ -171,19 +148,17 @@
enum aom_codec_cx_pkt_kind kind; /**< packet variant */
union {
struct {
- void *buf; /**< compressed data buffer */
- size_t sz; /**< length of compressed data */
- aom_codec_pts_t pts; /**< time stamp to show frame
- (in timebase units) */
- unsigned long duration; /**< duration to show frame
- (in timebase units) */
- aom_codec_frame_flags_t flags; /**< flags for this frame */
- int partition_id; /**< the partition id
- defines the decoding order
- of the partitions. Only
- applicable when "output partition"
- mode is enabled. First partition
- has id 0.*/
+ void *buf; /**< compressed data buffer */
+ size_t sz; /**< length of compressed data */
+ /*!\brief time stamp to show frame (in timebase units) */
+ aom_codec_pts_t pts;
+ /*!\brief duration to show frame (in timebase units) */
+ unsigned long duration;
+ aom_codec_frame_flags_t flags; /**< flags for this frame */
+ /*!\brief the partition id defines the decoding order of the partitions.
+ * Only applicable when "output partition" mode is enabled. First
+ * partition has id 0.*/
+ int partition_id;
} frame; /**< data for compressed frame packet */
aom_fixed_buf_t twopass_stats; /**< data for two-pass packet */
aom_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */
@@ -203,22 +178,6 @@
} data; /**< packet data */
} aom_codec_cx_pkt_t; /**< alias for struct aom_codec_cx_pkt */
-/*!\brief Encoder return output buffer callback
- *
- * This callback function, when registered, returns with packets when each
- * spatial layer is encoded.
- */
-// putting the definitions here for now. (agrange: find if there
-// is a better place for this)
-typedef void (*aom_codec_enc_output_cx_pkt_cb_fn_t)(aom_codec_cx_pkt_t *pkt,
- void *user_data);
-
-/*!\brief Callback function pointer / user data pair storage */
-typedef struct aom_codec_enc_output_cx_cb_pair {
- aom_codec_enc_output_cx_pkt_cb_fn_t output_cx_pkt; /**< Callback function */
- void *user_priv; /**< Pointer to private data */
-} aom_codec_priv_output_cx_pkt_cb_pair_t;
-
/*!\brief Rational Number
*
* This structure holds a fractional value.
@@ -570,8 +529,7 @@
* value should be used. Values in between indicate which way the
* encoder should "lean."
*/
- unsigned int rc_2pass_vbr_bias_pct; /**< RC mode bias between CBR and
- VBR(0-100: 0->CBR, 100->VBR) */
+ unsigned int rc_2pass_vbr_bias_pct;
/*!\brief Two-pass mode per-GOP minimum bitrate
*
@@ -694,7 +652,7 @@
*
* \param[in] iface Pointer to the algorithm interface to use.
* \param[out] cfg Configuration buffer to populate.
- * \param[in] reserved Must set to 0 for AOM and AV1.
+ * \param[in] reserved Must be set to 0 for VP8 and AV1.
*
* \retval #AOM_CODEC_OK
* The configuration was populated.
@@ -737,12 +695,12 @@
*/
aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
+/*!\brief deadline parameter analogous to AVx REALTIME mode. */
#define AOM_DL_REALTIME (1)
-/**< deadline parameter analogous to AVx REALTIME mode. */
+/*!\brief deadline parameter analogous to AVx GOOD QUALITY mode. */
#define AOM_DL_GOOD_QUALITY (1000000)
-/**< deadline parameter analogous to AVx GOOD QUALITY mode. */
+/*!\brief deadline parameter analogous to AVx BEST QUALITY mode. */
#define AOM_DL_BEST_QUALITY (0)
-/**< deadline parameter analogous to AVx BEST QUALITY mode. */
/*!\brief Encode a frame
*
* Encodes a video frame at the given "presentation time." The presentation
diff --git a/aom/aomcx.h b/aom/aomcx.h
index 2a46593..f3edd79 100644
--- a/aom/aomcx.h
+++ b/aom/aomcx.h
@@ -57,48 +57,27 @@
*/
#define AOM_EFLAG_NO_REF_GF (1 << 17)
-#if CONFIG_EXT_REFS
-/*!\brief Don't reference the backward reference frame
- *
- * When this flag is set, the encoder will not use the bwd frame as a
- * predictor. When not set, the encoder will choose whether to use the
- * bwd frame or not automatically.
- */
-#define AOM_EFLAG_NO_REF_BRF (1 << 18)
-
-#endif // CONFIG_EXT_REFS
-
/*!\brief Don't reference the alternate reference frame
*
* When this flag is set, the encoder will not use the alt ref frame as a
* predictor. When not set, the encoder will choose whether to use the
* alt ref frame or not automatically.
*/
-#define AOM_EFLAG_NO_REF_ARF (1 << 19)
+#define AOM_EFLAG_NO_REF_ARF (1 << 21)
/*!\brief Don't update the last frame
*
* When this flag is set, the encoder will not update the last frame with
* the contents of the current frame.
*/
-#define AOM_EFLAG_NO_UPD_LAST (1 << 20)
+#define AOM_EFLAG_NO_UPD_LAST (1 << 18)
/*!\brief Don't update the golden frame
*
* When this flag is set, the encoder will not update the golden frame with
* the contents of the current frame.
*/
-#define AOM_EFLAG_NO_UPD_GF (1 << 21)
-
-#if CONFIG_EXT_REFS
-/*!\brief Don't update the backward reference frame
- *
- * When this flag is set, the encoder will not update the bwd frame with
- * the contents of the current frame.
- */
-#define AOM_EFLAG_NO_UPD_BRF (1 << 22)
-
-#endif // CONFIG_EXT_REFS
+#define AOM_EFLAG_NO_UPD_GF (1 << 22)
/*!\brief Don't update the alternate reference frame
*
@@ -112,30 +91,21 @@
* When this flag is set, the encoder copies the contents of the current frame
* to the golden frame buffer.
*/
-#define AOM_EFLAG_FORCE_GF (1 << 24)
-
-#if CONFIG_EXT_REFS
-/*!\brief Force backward reference frame update
- *
- * When this flag is set, the encoder copy the contents of the current frame
- * to the bwd frame buffer.
- */
-#define AOM_EFLAG_FORCE_BRF (1 << 25)
-#endif // CONFIG_EXT_REFS
+#define AOM_EFLAG_FORCE_GF (1 << 19)
/*!\brief Force alternate reference frame update
*
* When this flag is set, the encoder copies the contents of the current frame
* to the alternate reference frame buffer.
*/
-#define AOM_EFLAG_FORCE_ARF (1 << 26)
+#define AOM_EFLAG_FORCE_ARF (1 << 24)
/*!\brief Disable entropy update
*
* When this flag is set, the encoder will not update its internal entropy
* model based on the entropy of this frame.
*/
-#define AOM_EFLAG_NO_UPD_ENTROPY (1 << 27)
+#define AOM_EFLAG_NO_UPD_ENTROPY (1 << 20)
/*!\brief AVx encoder control functions
*
@@ -145,21 +115,27 @@
* \sa #aom_codec_control
*/
enum aome_enc_control_id {
+ /*!\brief Codec control function to set which reference frames the encoder can use.
+ *
+ * Supported in codecs: VP8, AV1
+ */
+ AOME_USE_REFERENCE = 7,
+
/*!\brief Codec control function to pass an ROI map to encoder.
*
- * Supported in codecs: AOM, AV1
+ * Supported in codecs: VP8, AV1
*/
AOME_SET_ROI_MAP = 8,
/*!\brief Codec control function to pass an Active map to encoder.
*
- * Supported in codecs: AOM, AV1
+ * Supported in codecs: VP8, AV1
*/
AOME_SET_ACTIVEMAP,
/*!\brief Codec control function to set encoder scaling mode.
*
- * Supported in codecs: AOM, AV1
+ * Supported in codecs: VP8, AV1
*/
AOME_SET_SCALEMODE = 11,
@@ -169,43 +145,52 @@
* of motion estimation methods. Values greater than 0 will increase encoder
* speed at the expense of quality.
*
- * \note Valid range for AOM: -16..16
+ * \note Valid range for VP8: -16..16
* \note Valid range for AV1: -8..8
*
- * Supported in codecs: AOM, AV1
+ * Supported in codecs: VP8, AV1
*/
AOME_SET_CPUUSED = 13,
/*!\brief Codec control function to enable automatic setting and use of alt-ref frames.
*
- * Supported in codecs: AOM, AV1
+ * Supported in codecs: VP8, AV1
*/
AOME_SET_ENABLEAUTOALTREF,
+#if CONFIG_EXT_REFS
+ /*!\brief Codec control function to enable automatic set and use
+ * bwd-pred frames.
+ *
+ * Supported in codecs: AV1
+ */
+ AOME_SET_ENABLEAUTOBWDREF,
+#endif // CONFIG_EXT_REFS
+
/*!\brief control function to set noise sensitivity
*
* 0: off, 1: OnYOnly, 2: OnYUV,
* 3: OnYUVAggressive, 4: Adaptive
*
- * Supported in codecs: AOM
+ * Supported in codecs: VP8
*/
AOME_SET_NOISE_SENSITIVITY,
/*!\brief Codec control function to set sharpness.
*
- * Supported in codecs: AOM, AV1
+ * Supported in codecs: VP8, AV1
*/
AOME_SET_SHARPNESS,
/*!\brief Codec control function to set the threshold for MBs treated as static.
*
- * Supported in codecs: AOM, AV1
+ * Supported in codecs: VP8, AV1
*/
AOME_SET_STATIC_THRESHOLD,
/*!\brief Codec control function to set the number of token partitions.
*
- * Supported in codecs: AOM
+ * Supported in codecs: VP8
*/
AOME_SET_TOKEN_PARTITIONS,
@@ -213,7 +198,7 @@
*
* Return value uses internal quantizer scale defined by the codec.
*
- * Supported in codecs: AOM, AV1
+ * Supported in codecs: VP8, AV1
*/
AOME_GET_LAST_QUANTIZER,
@@ -222,19 +207,19 @@
* Return value uses the 0..63 scale as used by the rc_*_quantizer config
* parameters.
*
- * Supported in codecs: AOM, AV1
+ * Supported in codecs: VP8, AV1
*/
AOME_GET_LAST_QUANTIZER_64,
/*!\brief Codec control function to set the max no of frames to create arf.
*
- * Supported in codecs: AOM, AV1
+ * Supported in codecs: VP8, AV1
*/
AOME_SET_ARNR_MAXFRAMES,
/*!\brief Codec control function to set the filter strength for the arf.
*
- * Supported in codecs: AOM, AV1
+ * Supported in codecs: VP8, AV1
*/
AOME_SET_ARNR_STRENGTH,
@@ -243,7 +228,7 @@
/*!\brief Codec control function to set visual tuning.
*
- * Supported in codecs: AOM, AV1
+ * Supported in codecs: VP8, AV1
*/
AOME_SET_TUNING,
@@ -253,7 +238,7 @@
* set to #AOM_CQ.
* \note Valid range: 0..63
*
- * Supported in codecs: AOM, AV1
+ * Supported in codecs: VP8, AV1
*/
AOME_SET_CQ_LEVEL,
@@ -268,13 +253,13 @@
* For example, to allocate no more than 4.5 frames worth of bitrate
* to a keyframe, set this to 450.
*
- * Supported in codecs: AOM, AV1
+ * Supported in codecs: VP8, AV1
*/
AOME_SET_MAX_INTRA_BITRATE_PCT,
/*!\brief Codec control function to set reference and update frame flags.
*
- * Supported in codecs: AOM
+ * Supported in codecs: VP8
*/
AOME_SET_FRAME_FLAGS,
@@ -308,22 +293,11 @@
*/
AV1E_SET_GF_CBR_BOOST_PCT,
- /*!\brief Codec control function to set the temporal layer id.
- *
- * For temporal scalability: this control allows the application to set the
- * layer id for each frame to be encoded. Note that this control must be set
- * for every frame prior to encoding. The usage of this control function
- * supersedes the internal temporal pattern counter, which is now deprecated.
- *
- * Supported in codecs: AOM
- */
- AOME_SET_TEMPORAL_LAYER_ID,
-
/*!\brief Codec control function to set encoder screen content mode.
*
* 0: off, 1: On, 2: On with more aggressive rate control.
*
- * Supported in codecs: AOM
+ * Supported in codecs: VP8
*/
AOME_SET_SCREEN_CONTENT_MODE,
@@ -486,14 +460,6 @@
*/
AV1E_SET_TUNE_CONTENT,
- /*!\brief Codec control function to register callback to get per layer packet.
- * \note Parameter for this control function is a structure with a callback
- * function and a pointer to private data used by the callback.
- *
- * Supported in codecs: AV1
- */
- AV1E_REGISTER_CX_CALLBACK,
-
/*!\brief Codec control function to set color space info.
* \note Valid ranges: 0..7, default is "UNKNOWN".
* 0 = UNKNOWN,
@@ -509,18 +475,6 @@
*/
AV1E_SET_COLOR_SPACE,
- /*!\brief Codec control function to set temporal layering mode.
- * \note Valid ranges: 0..3, default is "0"
- * (AV1E_TEMPORAL_LAYERING_MODE_NOLAYERING).
- * 0 = AV1E_TEMPORAL_LAYERING_MODE_NOLAYERING
- * 1 = AV1E_TEMPORAL_LAYERING_MODE_BYPASS
- * 2 = AV1E_TEMPORAL_LAYERING_MODE_0101
- * 3 = AV1E_TEMPORAL_LAYERING_MODE_0212
- *
- * Supported in codecs: AV1
- */
- AV1E_SET_TEMPORAL_LAYERING_MODE,
-
/*!\brief Codec control function to set minimum interval between GF/ARF frames
*
* By default the value is set as 4.
@@ -559,6 +513,30 @@
* Supported in codecs: AV1
*/
AV1E_SET_RENDER_SIZE,
+
+ /*!\brief Codec control function to set target level.
+ *
+ * 255: off (default); 0: only keep level stats; 10: target for level 1.0;
+ * 11: target for level 1.1; ... 62: target for level 6.2
+ *
+ * Supported in codecs: AV1
+ */
+ AV1E_SET_TARGET_LEVEL,
+
+ /*!\brief Codec control function to get bitstream level.
+ *
+ * Supported in codecs: AV1
+ */
+ AV1E_GET_LEVEL,
+
+ /*!\brief Codec control function to set intended superblock size.
+ *
+ * By default, the superblock size is determined separately for each
+ * frame by the encoder.
+ *
+ * Supported in codecs: AV1
+ */
+ AV1E_SET_SUPERBLOCK_SIZE,
};
/*!\brief aom 1-D scaling mode
@@ -599,9 +577,8 @@
*/
typedef struct aom_active_map {
- unsigned char
- *active_map; /**< specify an on (1) or off (0) each 16x16 region within a
- frame */
+ /*!\brief specify on (1) or off (0) for each 16x16 region within a frame */
+ unsigned char *active_map;
unsigned int rows; /**< number of rows */
unsigned int cols; /**< number of cols */
} aom_active_map_t;
@@ -616,9 +593,9 @@
AOM_SCALING_MODE v_scaling_mode; /**< vertical scaling mode */
} aom_scaling_mode_t;
-/*!\brief AOM token partition mode
+/*!\brief VP8 token partition mode
*
- * This defines AOM partitioning mode for compressed data, i.e., the number of
+ * This defines VP8 partitioning mode for compressed data, i.e., the number of
* sub-streams in the bitstream. Used for parallelized decoding.
*
*/
@@ -637,7 +614,7 @@
AOM_CONTENT_INVALID
} aom_tune_content;
-/*!\brief AOM model tuning parameters
+/*!\brief VP8 model tuning parameters
*
* Changes the encoder to tune for certain types of input material.
*
@@ -645,17 +622,17 @@
typedef enum { AOM_TUNE_PSNR, AOM_TUNE_SSIM } aom_tune_metric;
/*!\cond */
-/*!\brief AOM encoder control function parameter type
+/*!\brief VP8 encoder control function parameter type
*
- * Defines the data types that AOME control functions take. Note that
+ * Defines the data types that VP8E control functions take. Note that
* additional common controls are defined in aom.h
*
*/
+AOM_CTRL_USE_TYPE_DEPRECATED(AOME_USE_REFERENCE, int)
+#define AOM_CTRL_AOME_USE_REFERENCE
AOM_CTRL_USE_TYPE(AOME_SET_FRAME_FLAGS, int)
#define AOM_CTRL_AOME_SET_FRAME_FLAGS
-AOM_CTRL_USE_TYPE(AOME_SET_TEMPORAL_LAYER_ID, int)
-#define AOM_CTRL_AOME_SET_TEMPORAL_LAYER_ID
AOM_CTRL_USE_TYPE(AOME_SET_ROI_MAP, aom_roi_map_t *)
#define AOM_CTRL_AOME_SET_ROI_MAP
AOM_CTRL_USE_TYPE(AOME_SET_ACTIVEMAP, aom_active_map_t *)
@@ -663,13 +640,16 @@
AOM_CTRL_USE_TYPE(AOME_SET_SCALEMODE, aom_scaling_mode_t *)
#define AOM_CTRL_AOME_SET_SCALEMODE
-AOM_CTRL_USE_TYPE(AV1E_REGISTER_CX_CALLBACK, void *)
-#define AOM_CTRL_AV1E_REGISTER_CX_CALLBACK
-
AOM_CTRL_USE_TYPE(AOME_SET_CPUUSED, int)
#define AOM_CTRL_AOME_SET_CPUUSED
AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOALTREF, unsigned int)
#define AOM_CTRL_AOME_SET_ENABLEAUTOALTREF
+
+#if CONFIG_EXT_REFS
+AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOBWDREF, unsigned int)
+#define AOM_CTRL_AOME_SET_ENABLEAUTOBWDREF
+#endif // CONFIG_EXT_REFS
+
AOM_CTRL_USE_TYPE(AOME_SET_NOISE_SENSITIVITY, unsigned int)
#define AOM_CTRL_AOME_SET_NOISE_SENSITIVITY
AOM_CTRL_USE_TYPE(AOME_SET_SHARPNESS, unsigned int)
@@ -755,11 +735,23 @@
AOM_CTRL_USE_TYPE(AV1E_SET_COLOR_RANGE, int)
#define AOM_CTRL_AV1E_SET_COLOR_RANGE
-AOM_CTRL_USE_TYPE(AV1E_SET_RENDER_SIZE, int *)
+/*!\brief
+ *
+ * TODO(rbultje) : add support of the control in ffmpeg
+ */
#define AOM_CTRL_AV1E_SET_RENDER_SIZE
+AOM_CTRL_USE_TYPE(AV1E_SET_RENDER_SIZE, int *)
+AOM_CTRL_USE_TYPE(AV1E_SET_SUPERBLOCK_SIZE, unsigned int)
+#define AOM_CTRL_AV1E_SET_SUPERBLOCK_SIZE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_LEVEL, unsigned int)
+#define AOM_CTRL_AV1E_SET_TARGET_LEVEL
+
+AOM_CTRL_USE_TYPE(AV1E_GET_LEVEL, int *)
+#define AOM_CTRL_AV1E_GET_LEVEL
/*!\endcond */
-/*! @} - end defgroup aom_encoder */
+/*! @} - end defgroup vp8_encoder */
#ifdef __cplusplus
} // extern "C"
#endif
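
A short sketch of how the new AV1 controls above might be driven, using the aom_superblock_size_t values from aom_codec.h and the level encoding described in the AV1E_SET_TARGET_LEVEL comment (e.g. 41 for level 4.1). Assumes an initialized encoder context; the function names are illustrative and return values are ignored for brevity:

    #include "aom/aomcx.h"

    static void configure_av1_extras(aom_codec_ctx_t *enc) {
      /* Force fixed 128x128 superblocks rather than per-frame selection. */
      aom_codec_control(enc, AV1E_SET_SUPERBLOCK_SIZE,
                        AOM_SUPERBLOCK_SIZE_128X128);
      /* Target level 4.1; 255 (the default) leaves level targeting off. */
      aom_codec_control(enc, AV1E_SET_TARGET_LEVEL, 41);
    }

    static int query_coded_level(aom_codec_ctx_t *enc) {
      int level = 0;
      aom_codec_control(enc, AV1E_GET_LEVEL, &level);
      return level;
    }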
diff --git a/aom/aomdx.h b/aom/aomdx.h
index 9900000..19256fb 100644
--- a/aom/aomdx.h
+++ b/aom/aomdx.h
@@ -37,6 +37,8 @@
extern aom_codec_iface_t *aom_codec_av1_dx(void);
/*!@} - end algorithm interface member group*/
+/** Data structure that stores bit accounting for debug
+ */
typedef struct Accounting Accounting;
/*!\enum aom_dec_control_id
@@ -113,7 +115,16 @@
*/
AV1_GET_ACCOUNTING,
- AOM_DECODER_CTRL_ID_MAX
+ AOM_DECODER_CTRL_ID_MAX,
+
+ /** Control function to set the range of tile decoding. A value greater
+ * than or equal to zero indicates that only the specified row/column is
+ * decoded; a value of -1 indicates that the whole row/column is decoded.
+ * As a special case, setting both values to -1 decodes the whole frame.
+ */
+ AV1_SET_DECODE_TILE_ROW,
+ AV1_SET_DECODE_TILE_COL
};
/** Decrypt n bytes of data from input -> output, using the decrypt_state
@@ -166,7 +177,10 @@
#define AOM_CTRL_AV1_INVERT_TILE_DECODE_ORDER
AOM_CTRL_USE_TYPE(AV1_GET_ACCOUNTING, Accounting **)
#define AOM_CTRL_AV1_GET_ACCOUNTING
-
+AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_ROW, int)
+#define AOM_CTRL_AV1_SET_DECODE_TILE_ROW
+AOM_CTRL_USE_TYPE(AV1_SET_DECODE_TILE_COL, int)
+#define AOM_CTRL_AV1_SET_DECODE_TILE_COL
/*!\endcond */
/*! @} - end defgroup aom_decoder */
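
A minimal sketch of the new tile-decoding controls, following the semantics documented above (a non-negative value selects a single row/column, -1 selects all). Assumes an initialized decoder context; the wrapper name is illustrative and error handling is omitted:

    #include "aom/aomdx.h"

    /* Restrict decoding to one tile, or pass -1/-1 to decode the whole frame. */
    static void set_decode_tile(aom_codec_ctx_t *decoder, int tile_row,
                                int tile_col) {
      aom_codec_control(decoder, AV1_SET_DECODE_TILE_ROW, tile_row);
      aom_codec_control(decoder, AV1_SET_DECODE_TILE_COL, tile_col);
    }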
diff --git a/aom/exports_enc b/aom/exports_enc
index 4932c26..0dcca7d 100644
--- a/aom/exports_enc
+++ b/aom/exports_enc
@@ -7,9 +7,3 @@
text aom_codec_get_global_headers
text aom_codec_get_preview_frame
text aom_codec_set_cx_data_buf
-text aom_svc_dump_statistics
-text aom_svc_encode
-text aom_svc_get_message
-text aom_svc_init
-text aom_svc_release
-text aom_svc_set_options
diff --git a/aom_dsp/add_noise.c b/aom_dsp/add_noise.c
new file mode 100644
index 0000000..2b281b7
--- /dev/null
+++ b/aom_dsp/add_noise.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+void aom_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16],
+ char whiteclamp[16], char bothclamp[16],
+ unsigned int width, unsigned int height, int pitch) {
+ unsigned int i, j;
+
+ for (i = 0; i < height; ++i) {
+ uint8_t *pos = start + i * pitch;
+ char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT
+
+ for (j = 0; j < width; ++j) {
+ int v = pos[j];
+
+ v = clamp(v - blackclamp[0], 0, 255);
+ v = clamp(v + bothclamp[0], 0, 255);
+ v = clamp(v - whiteclamp[0], 0, 255);
+
+ pos[j] = v + ref[j];
+ }
+ }
+}
+
+static double gaussian(double sigma, double mu, double x) {
+ return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
+ (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
+}
+
+int aom_setup_noise(double sigma, int size, char *noise) {
+ char char_dist[256];
+ int next = 0, i, j;
+
+ // set up a 256 entry lookup that matches gaussian distribution
+ for (i = -32; i < 32; ++i) {
+ const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
+ if (a_i) {
+ for (j = 0; j < a_i; ++j) {
+ char_dist[next + j] = (char)i;
+ }
+ next = next + j;
+ }
+ }
+
+ // Rounding error - might mean we have less than 256.
+ for (; next < 256; ++next) {
+ char_dist[next] = 0;
+ }
+
+ for (i = 0; i < size; ++i) {
+ noise[i] = char_dist[rand() & 0xff]; // NOLINT
+ }
+
+ // Returns the highest non 0 value used in distribution.
+ return -char_dist[0];
+}
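
A sketch of how the two helpers above might be combined: aom_setup_noise() builds the lookup table and returns the largest noise magnitude, which is assumed here to be the value the clamp arrays should carry (black/white clamps at that magnitude, the combined clamp at twice it); aom_plane_add_noise_c() then perturbs a plane. The wrapper name, buffer size, and clamp derivation are illustrative, not taken from this change:

    #include <string.h>

    #include "./aom_dsp_rtcd.h"
    #include "aom/aom_integer.h"

    static void add_noise_to_plane(uint8_t *plane, unsigned int w,
                                   unsigned int h, int stride, double sigma) {
      /* Lookup table plus headroom for the (rand() & 0xff) row offset;
       * sized for plane widths up to roughly 2800 pixels. */
      char noise[3072];
      char blackclamp[16], whiteclamp[16], bothclamp[16];
      const int clamp_val = aom_setup_noise(sigma, (int)sizeof(noise), noise);

      memset(blackclamp, clamp_val, sizeof(blackclamp));
      memset(whiteclamp, clamp_val, sizeof(whiteclamp));
      memset(bothclamp, 2 * clamp_val, sizeof(bothclamp));

      aom_plane_add_noise_c(plane, noise, blackclamp, whiteclamp, bothclamp, w,
                            h, stride);
    }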
diff --git a/aom_dsp/ansreader.h b/aom_dsp/ansreader.h
index c46778b..0e9a671 100644
--- a/aom_dsp/ansreader.h
+++ b/aom_dsp/ansreader.h
@@ -20,6 +20,9 @@
#include "aom_dsp/prob.h"
#include "aom_dsp/ans.h"
#include "aom_ports/mem_ops.h"
+#if CONFIG_ACCOUNTING
+#include "av1/common/accounting.h"
+#endif
#ifdef __cplusplus
extern "C" {
@@ -29,6 +32,9 @@
const uint8_t *buf;
int buf_offset;
uint32_t state;
+#if CONFIG_ACCOUNTING
+ Accounting *accounting;
+#endif
};
static INLINE int uabs_read(struct AnsDecoder *ans, AnsP8 p0) {
@@ -119,6 +125,9 @@
// 110xxxxx implies this byte is a superframe marker
return 1;
}
+#if CONFIG_ACCOUNTING
+ ans->accounting = NULL;
+#endif
ans->state += L_BASE;
if (ans->state >= L_BASE * IO_BASE) return 1;
return 0;
diff --git a/aom_dsp/aom_convolve.c b/aom_dsp/aom_convolve.c
index d74957b..a079d1b 100644
--- a/aom_dsp/aom_convolve.c
+++ b/aom_dsp/aom_convolve.c
@@ -129,19 +129,21 @@
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
- uint8_t temp[135 * 64];
+ uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
- assert(w <= 64);
- assert(h <= 64);
+ assert(w <= MAX_SB_SIZE);
+ assert(h <= MAX_SB_SIZE);
+
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
- convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
- x_filters, x0_q4, x_step_q4, w, intermediate_height);
- convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
- y_filters, y0_q4, y_step_q4, w, h);
+ convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
+ MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
+ intermediate_height);
+ convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst,
+ dst_stride, y_filters, y0_q4, y_step_q4, w, h);
}
static const InterpKernel *get_filter_base(const int16_t *filter) {
@@ -233,13 +235,14 @@
int x_step_q4, const int16_t *filter_y, int y_step_q4,
int w, int h) {
/* Fixed size intermediate buffer places limits on parameters. */
- DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
- assert(w <= 64);
- assert(h <= 64);
+ DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
+ assert(w <= MAX_SB_SIZE);
+ assert(h <= MAX_SB_SIZE);
- aom_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y,
- y_step_q4, w, h);
- aom_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+ aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
+ h);
}
void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
@@ -447,21 +450,21 @@
// --Must round-up because block may be located at sub-pixel position.
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
- uint16_t temp[64 * 135];
+ uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
- assert(w <= 64);
- assert(h <= 64);
+ assert(w <= MAX_SB_SIZE);
+ assert(h <= MAX_SB_SIZE);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
- CONVERT_TO_BYTEPTR(temp), 64, x_filters, x0_q4,
+ CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4,
x_step_q4, w, intermediate_height, bd);
- highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
- 64, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h,
- bd);
+ highbd_convolve_vert(
+ CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
+ MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
}
void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -541,14 +544,14 @@
const int16_t *filter_y, int y_step_q4, int w,
int h, int bd) {
// Fixed size intermediate buffer places limits on parameters.
- DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
- assert(w <= 64);
- assert(h <= 64);
+ DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
+ assert(w <= MAX_SB_SIZE);
+ assert(h <= MAX_SB_SIZE);
- aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+ aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
- aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride, NULL,
- 0, NULL, 0, w, h, bd);
+ aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst,
+ dst_stride, NULL, 0, NULL, 0, w, h, bd);
}
void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
diff --git a/aom_dsp/aom_convolve.h b/aom_dsp/aom_convolve.h
index 3a33fe0..75d24a4 100644
--- a/aom_dsp/aom_convolve.h
+++ b/aom_dsp/aom_convolve.h
@@ -18,6 +18,24 @@
extern "C" {
#endif
+// Note: Fixed size intermediate buffers place limits on the parameters
+// of some functions. 2d filtering proceeds in 2 steps:
+// (1) Interpolate horizontally into an intermediate buffer, temp.
+// (2) Interpolate temp vertically to derive the sub-pixel result.
+// Deriving the maximum number of rows in the temp buffer (135):
+// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+// --Largest block size is 64x64 pixels.
+// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+// original frame (in 1/16th pixel units).
+// --Must round-up because block may be located at sub-pixel position.
+// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+#define MAX_EXT_SIZE 263
+#else
+#define MAX_EXT_SIZE 135
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+
typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index 34be453..07fbe02 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -191,20 +191,24 @@
endif # CONFIG_AOM_HIGHBITDEPTH
DSP_SRCS-yes += txfm_common.h
+DSP_SRCS-yes += x86/txfm_common_intrin.h
DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h
DSP_SRCS-$(HAVE_MSA) += mips/txfm_macros_msa.h
# forward transform
-ifneq ($(filter yes,$(CONFIG_AV1_ENCODER)),)
+ifeq ($(CONFIG_AV1),yes)
DSP_SRCS-yes += fwd_txfm.c
DSP_SRCS-yes += fwd_txfm.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_sse2.c
+DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32_8cols_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_txfm_impl_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h
ifeq ($(ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm
endif
+DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.h
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/txfm_common_avx2.h
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h
DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h
@@ -231,7 +235,7 @@
endif # CONFIG_PVQ
# inverse transform
-ifneq ($(filter yes,$(CONFIG_AV1)),)
+ifeq ($(CONFIG_AV1), yes)
DSP_SRCS-yes += inv_txfm.h
DSP_SRCS-yes += inv_txfm.c
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h
@@ -282,7 +286,7 @@
endif # CONFIG_AV1
# quantization
-ifneq ($(filter yes, $(CONFIG_AV1_ENCODER)),)
+ifneq ($(filter yes,$(CONFIG_AV1_ENCODER)),)
DSP_SRCS-yes += quantize.c
DSP_SRCS-yes += quantize.h
@@ -300,12 +304,24 @@
DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c
DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c
+DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c
ifeq ($(ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
endif
+# high bit depth subtract
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subtract_sse2.c
+endif
+
endif # CONFIG_AV1_ENCODER
+ifeq ($(CONFIG_AV1_ENCODER),yes)
+DSP_SRCS-yes += sum_squares.c
+
+DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c
+endif # CONFIG_AV1_ENCODER
+
ifeq ($(CONFIG_ENCODERS),yes)
DSP_SRCS-yes += sad.c
DSP_SRCS-yes += subtract.c
@@ -324,6 +340,17 @@
DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
+ifeq ($(CONFIG_AV1_ENCODER),yes)
+ifeq ($(CONFIG_EXT_INTER),yes)
+DSP_SRCS-$(HAVE_SSSE3) += x86/masked_sad_intrin_ssse3.c
+DSP_SRCS-$(HAVE_SSSE3) += x86/masked_variance_intrin_ssse3.c
+endif #CONFIG_EXT_INTER
+ifeq ($(CONFIG_MOTION_VAR),yes)
+DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c
+endif #CONFIG_MOTION_VAR
+endif #CONFIG_AV1_ENCODER
+
DSP_SRCS-$(HAVE_SSE) += x86/sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE) += x86/sad_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
@@ -369,14 +396,10 @@
ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
endif # CONFIG_AOM_HIGHBITDEPTH
-
-ifeq ($(CONFIG_MOTION_VAR),yes)
-DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c
-DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c
-endif #CONFIG_MOTION_VAR
endif # CONFIG_ENCODERS
DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
diff --git a/aom_dsp/aom_dsp_common.h b/aom_dsp/aom_dsp_common.h
index 6498c46..a7ea1b9 100644
--- a/aom_dsp/aom_dsp_common.h
+++ b/aom_dsp/aom_dsp_common.h
@@ -21,31 +21,16 @@
#endif
#ifndef MAX_SB_SIZE
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+#define MAX_SB_SIZE 128
+#else
#define MAX_SB_SIZE 64
-#endif
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+#endif // ndef MAX_SB_SIZE
#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
-#if CONFIG_AOM_QM
-typedef uint16_t qm_val_t;
-#define AOM_QM_BITS 6
-#endif
-
-#if CONFIG_AOM_HIGHBITDEPTH
-// Note:
-// tran_low_t is the datatype used for final transform coefficients.
-// tran_high_t is the datatype used for intermediate transform stages.
-typedef int64_t tran_high_t;
-typedef int32_t tran_low_t;
-#else
-// Note:
-// tran_low_t is the datatype used for final transform coefficients.
-// tran_high_t is the datatype used for intermediate transform stages.
-typedef int32_t tran_high_t;
-typedef int16_t tran_low_t;
-#endif // CONFIG_AOM_HIGHBITDEPTH
-
#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b')
#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0)
@@ -62,6 +47,31 @@
#define UNLIKELY(v) (v)
#endif
+#define AOM_SWAP(type, a, b) \
+ do { \
+ type c = (b); \
+ b = a; \
+ a = c; \
+ } while (0)
+
+#if CONFIG_AOM_QM
+typedef uint16_t qm_val_t;
+#define AOM_QM_BITS 6
+#endif
+#if CONFIG_AOM_HIGHBITDEPTH
+// Note:
+// tran_low_t is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+#else
+// Note:
+// tran_low_t is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int32_t tran_high_t;
+typedef int16_t tran_low_t;
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
static INLINE uint8_t clip_pixel(int val) {
return (val > 255) ? 255 : (val < 0) ? 0 : val;
}
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 53af943..94e2587 100644
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -27,7 +27,12 @@
$avx2_x86_64 = 'avx2';
}
-@block_widths = (4, 8, 16, 32, 64);
+if (aom_config("CONFIG_EXT_PARTITION") eq "yes") {
+ @block_widths = (4, 8, 16, 32, 64, 128)
+} else {
+ @block_widths = (4, 8, 16, 32, 64)
+}
+
@block_sizes = ();
foreach $w (@block_widths) {
foreach $h (@block_widths) {
@@ -90,12 +95,21 @@
add_proto qw/void aom_d153_predictor_2x2/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_d153_predictor_2x2/;
+add_proto qw/void aom_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_d207_predictor_4x4 sse2/;
+
add_proto qw/void aom_d207e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_d207e_predictor_4x4/;
+add_proto qw/void aom_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_d45_predictor_4x4 neon sse2/;
+
add_proto qw/void aom_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_d45e_predictor_4x4/;
+add_proto qw/void aom_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_d63_predictor_4x4 ssse3/;
+
add_proto qw/void aom_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_d63e_predictor_4x4/;
@@ -123,8 +137,12 @@
add_proto qw/void aom_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_ve_predictor_4x4/;
-add_proto qw/void aom_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_tm_predictor_4x4 neon dspr2 msa sse2/;
+if ((aom_config("CONFIG_ALT_INTRA") eq "yes")) {
+ add_proto qw/void aom_paeth_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+} else {
+ add_proto qw/void aom_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+ specialize qw/aom_tm_predictor_4x4 neon dspr2 msa sse2/;
+} # CONFIG_ALT_INTRA
add_proto qw/void aom_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/;
@@ -138,12 +156,21 @@
add_proto qw/void aom_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
+add_proto qw/void aom_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_d207_predictor_8x8 ssse3/;
+
add_proto qw/void aom_d207e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_d207e_predictor_8x8/;
+add_proto qw/void aom_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_d45_predictor_8x8 neon sse2/;
+
add_proto qw/void aom_d45e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_d45e_predictor_8x8/;
+add_proto qw/void aom_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_d63_predictor_8x8 ssse3/;
+
add_proto qw/void aom_d63e_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_d63e_predictor_8x8/;
@@ -162,8 +189,12 @@
add_proto qw/void aom_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_v_predictor_8x8 neon msa sse2/;
-add_proto qw/void aom_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_tm_predictor_8x8 neon dspr2 msa sse2/;
+if ((aom_config("CONFIG_ALT_INTRA") eq "yes")) {
+ add_proto qw/void aom_paeth_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+} else {
+ add_proto qw/void aom_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+ specialize qw/aom_tm_predictor_8x8 neon dspr2 msa sse2/;
+} # CONFIG_ALT_INTRA
add_proto qw/void aom_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/;
@@ -177,12 +208,21 @@
add_proto qw/void aom_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
+add_proto qw/void aom_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_d207_predictor_16x16 ssse3/;
+
add_proto qw/void aom_d207e_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_d207e_predictor_16x16/;
+add_proto qw/void aom_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_d45_predictor_16x16 neon ssse3/;
+
add_proto qw/void aom_d45e_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_d45e_predictor_16x16/;
+add_proto qw/void aom_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_d63_predictor_16x16 ssse3/;
+
add_proto qw/void aom_d63e_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_d63e_predictor_16x16/;
@@ -201,8 +241,13 @@
add_proto qw/void aom_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_v_predictor_16x16 neon msa sse2/;
-add_proto qw/void aom_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_tm_predictor_16x16 neon msa sse2/;
+if ((aom_config("CONFIG_ALT_INTRA") eq "yes")) {
+ add_proto qw/void aom_paeth_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+} else {
+ add_proto qw/void aom_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+ specialize qw/aom_tm_predictor_16x16 neon msa sse2/;
+} # CONFIG_ALT_INTRA
+
add_proto qw/void aom_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/;
@@ -216,12 +261,21 @@
add_proto qw/void aom_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
+add_proto qw/void aom_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_d207_predictor_32x32 ssse3/;
+
add_proto qw/void aom_d207e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_d207e_predictor_32x32/;
+add_proto qw/void aom_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_d45_predictor_32x32 ssse3/;
+
add_proto qw/void aom_d45e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_d45e_predictor_32x32/;
+add_proto qw/void aom_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+specialize qw/aom_d63_predictor_32x32 ssse3/;
+
add_proto qw/void aom_d63e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_d63e_predictor_32x32/;
@@ -240,8 +294,13 @@
add_proto qw/void aom_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_v_predictor_32x32 neon msa sse2/;
-add_proto qw/void aom_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/aom_tm_predictor_32x32 neon msa sse2/;
+if ((aom_config("CONFIG_ALT_INTRA") eq "yes")) {
+ add_proto qw/void aom_paeth_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+} else {
+ add_proto qw/void aom_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+ specialize qw/aom_tm_predictor_32x32 neon msa sse2/;
+} # CONFIG_ALT_INTRA
+
add_proto qw/void aom_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/aom_dc_predictor_32x32 msa neon sse2/;
@@ -257,12 +316,21 @@
# High bitdepth functions
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/aom_highbd_d207_predictor_4x4/;
+
add_proto qw/void aom_highbd_d207e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_d207e_predictor_4x4/;
+ add_proto qw/void aom_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/aom_highbd_d45_predictor_4x4/;
+
add_proto qw/void aom_highbd_d45e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_d45e_predictor_4x4/;
+ add_proto qw/void aom_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/aom_highbd_d63_predictor_4x4/;
+
add_proto qw/void aom_highbd_d63e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_d63e_predictor_4x4/;
@@ -281,8 +349,13 @@
add_proto qw/void aom_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_v_predictor_4x4 sse2/;
- add_proto qw/void aom_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/aom_highbd_tm_predictor_4x4 sse2/;
+ if ((aom_config("CONFIG_ALT_INTRA") eq "yes")) {
+ add_proto qw/void aom_highbd_paeth_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ } else {
+ add_proto qw/void aom_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/aom_highbd_tm_predictor_4x4 sse2/;
+ } # CONFIG_ALT_INTRA
+
add_proto qw/void aom_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_dc_predictor_4x4 sse2/;
@@ -296,12 +369,21 @@
add_proto qw/void aom_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_dc_128_predictor_4x4/;
+ add_proto qw/void aom_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/aom_highbd_d207_predictor_8x8/;
+
add_proto qw/void aom_highbd_d207e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_d207e_predictor_8x8/;
+ add_proto qw/void aom_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/aom_highbd_d45_predictor_8x8/;
+
add_proto qw/void aom_highbd_d45e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_d45e_predictor_8x8/;
+ add_proto qw/void aom_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/aom_highbd_d63_predictor_8x8/;
+
add_proto qw/void aom_highbd_d63e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_d63e_predictor_8x8/;
@@ -320,8 +402,12 @@
add_proto qw/void aom_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_v_predictor_8x8 sse2/;
- add_proto qw/void aom_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/aom_highbd_tm_predictor_8x8 sse2/;
+ if ((aom_config("CONFIG_ALT_INTRA") eq "yes")) {
+ add_proto qw/void aom_highbd_paeth_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ } else {
+ add_proto qw/void aom_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/aom_highbd_tm_predictor_8x8 sse2/;
+ } # CONFIG_ALT_INTRA
add_proto qw/void aom_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_dc_predictor_8x8 sse2/;;
@@ -335,12 +421,21 @@
add_proto qw/void aom_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_dc_128_predictor_8x8/;
+ add_proto qw/void aom_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/aom_highbd_d207_predictor_16x16/;
+
add_proto qw/void aom_highbd_d207e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_d207e_predictor_16x16/;
+ add_proto qw/void aom_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/aom_highbd_d45_predictor_16x16/;
+
add_proto qw/void aom_highbd_d45e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_d45e_predictor_16x16/;
+ add_proto qw/void aom_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/aom_highbd_d63_predictor_16x16/;
+
add_proto qw/void aom_highbd_d63e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_d63e_predictor_16x16/;
@@ -359,8 +454,12 @@
add_proto qw/void aom_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_v_predictor_16x16 sse2/;
- add_proto qw/void aom_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/aom_highbd_tm_predictor_16x16 sse2/;
+ if ((aom_config("CONFIG_ALT_INTRA") eq "yes")) {
+ add_proto qw/void aom_highbd_paeth_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ } else {
+ add_proto qw/void aom_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/aom_highbd_tm_predictor_16x16 sse2/;
+ } # CONFIG_ALT_INTRA
add_proto qw/void aom_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_dc_predictor_16x16 sse2/;
@@ -374,12 +473,21 @@
add_proto qw/void aom_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_dc_128_predictor_16x16/;
+ add_proto qw/void aom_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/aom_highbd_d207_predictor_32x32/;
+
add_proto qw/void aom_highbd_d207e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_d207e_predictor_32x32/;
+ add_proto qw/void aom_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/aom_highbd_d45_predictor_32x32/;
+
add_proto qw/void aom_highbd_d45e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_d45e_predictor_32x32/;
+ add_proto qw/void aom_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/aom_highbd_d63_predictor_32x32/;
+
add_proto qw/void aom_highbd_d63e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_d63e_predictor_32x32/;
@@ -398,8 +506,12 @@
add_proto qw/void aom_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_v_predictor_32x32 sse2/;
- add_proto qw/void aom_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/aom_highbd_tm_predictor_32x32 sse2/;
+ if ((aom_config("CONFIG_ALT_INTRA") eq "yes")) {
+ add_proto qw/void aom_highbd_paeth_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ } else {
+ add_proto qw/void aom_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/aom_highbd_tm_predictor_32x32 sse2/;
+ } # CONFIG_ALT_INTRA
add_proto qw/void aom_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/aom_highbd_dc_predictor_32x32 sse2/;
@@ -417,52 +529,44 @@
#
# Sub Pixel Filters
#
-add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_convolve_copy neon dspr2 msa sse2/;
-
-add_proto qw/void aom_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_convolve_avg neon dspr2 msa sse2/;
-
-add_proto qw/void aom_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void aom_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_convolve8_avg sse2 ssse3 neon dspr2 msa/;
-
+add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void aom_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/;
+add_proto qw/void aom_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/;
+specialize qw/aom_convolve_copy sse2 /;
+specialize qw/aom_convolve_avg sse2 /;
+specialize qw/aom_convolve8 sse2 ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve8_horiz sse2 ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve8_vert sse2 ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve8_avg sse2 ssse3/;
+specialize qw/aom_convolve8_avg_horiz sse2 ssse3/;
+specialize qw/aom_convolve8_avg_vert sse2 ssse3/;
+specialize qw/aom_scaled_2d ssse3/;
-add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_scaled_2d ssse3/;
-
-add_proto qw/void aom_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_scaled_horiz/;
-
-add_proto qw/void aom_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_scaled_vert/;
-
-add_proto qw/void aom_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_scaled_avg_2d/;
-
-add_proto qw/void aom_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_scaled_avg_horiz/;
-
-add_proto qw/void aom_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/aom_scaled_avg_vert/;
+# TODO(any): These need to be extended to up to 128x128 block sizes
+if (!(aom_config("CONFIG_AV1") eq "yes" && aom_config("CONFIG_EXT_PARTITION") eq "yes")) {
+ specialize qw/aom_convolve_copy neon dspr2 msa/;
+ specialize qw/aom_convolve_avg neon dspr2 msa/;
+ specialize qw/aom_convolve8 neon dspr2 msa/;
+ specialize qw/aom_convolve8_horiz neon dspr2 msa/;
+ specialize qw/aom_convolve8_vert neon dspr2 msa/;
+ specialize qw/aom_convolve8_avg neon dspr2 msa/;
+ specialize qw/aom_convolve8_avg_horiz neon dspr2 msa/;
+ specialize qw/aom_convolve8_avg_vert neon dspr2 msa/;
+}
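#
# Illustrative note (not part of this patch): per the TODO above, the
# NEON/DSPR2/MSA convolve versions only cover block sizes up to 64x64, so they
# are only wired up when AV1's CONFIG_EXT_PARTITION (which adds 128x128 blocks)
# is not in use. As a reference point, a minimal C sketch of the simplest
# member of the family, the copy kernel, which ignores the filter arguments and
# copies a w x h block (memcpy from <string.h>); the name is illustrative:
#
#   static void convolve_copy(const uint8_t *src, ptrdiff_t src_stride,
#                             uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
#     for (int r = 0; r < h; ++r)
#       memcpy(dst + r * dst_stride, src + r * src_stride, w);
#   }
#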
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
- #
- # Sub Pixel Filters
- #
add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/aom_highbd_convolve_copy sse2/;
@@ -596,85 +700,86 @@
# Forward transform
#
if ((aom_config("CONFIG_AV1_ENCODER") eq "yes") || (aom_config("CONFIG_PVQ") eq "yes")){
-if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
- add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct4x4 sse2/;
+ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct4x4 sse2/;
- add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct4x4_1 sse2/;
+ add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct4x4_1 sse2/;
- add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct8x8 sse2/;
+ add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64";
- add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct8x8_1 sse2/;
+ add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct8x8_1 sse2/;
- add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct16x16 sse2/;
+ add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct16x16 sse2/;
- add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct16x16_1 sse2/;
+ add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct16x16_1 sse2 avx2/;
- add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct32x32 sse2/;
+ add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32 sse2 avx2/;
- add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct32x32_rd sse2/;
+ add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32_rd sse2 avx2/;
- add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct32x32_1 sse2/;
+ add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32_1 sse2 avx2/;
- add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct4x4 sse2/;
+ # High bit depth
+ add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct4x4 sse2/;
- add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct8x8 sse2/;
+ add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct8x8 sse2/;
- add_proto qw/void aom_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct8x8_1/;
+ add_proto qw/void aom_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct8x8_1/;
- add_proto qw/void aom_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct16x16 sse2/;
+ add_proto qw/void aom_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct16x16 sse2/;
- add_proto qw/void aom_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct16x16_1/;
+ add_proto qw/void aom_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct16x16_1/;
- add_proto qw/void aom_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct32x32 sse2/;
+ add_proto qw/void aom_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct32x32 sse2/;
- add_proto qw/void aom_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct32x32_rd sse2/;
+ add_proto qw/void aom_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct32x32_rd sse2/;
- add_proto qw/void aom_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct32x32_1/;
-} else {
- add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct4x4 sse2 msa/;
+ add_proto qw/void aom_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct32x32_1/;
+ } else {
+ add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct4x4 sse2 msa/;
- add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct4x4_1 sse2/;
+ add_proto qw/void aom_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct4x4_1 sse2/;
- add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
+ add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
- add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct8x8_1 sse2 neon msa/;
+ add_proto qw/void aom_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct8x8_1 sse2 neon msa/;
- add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct16x16 sse2 msa/;
+ add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct16x16 sse2 msa/;
- add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct16x16_1 sse2 msa/;
+ add_proto qw/void aom_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct16x16_1 sse2 avx2 msa/;
- add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct32x32 sse2 avx2 msa/;
+ add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32 sse2 avx2 msa/;
- add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct32x32_rd sse2 avx2 msa/;
+ add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32_rd sse2 avx2 msa/;
- add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct32x32_1 sse2 msa/;
-} # CONFIG_AOM_HIGHBITDEPTH
+ add_proto qw/void aom_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct32x32_1 sse2 avx2 msa/;
+ } # CONFIG_AOM_HIGHBITDEPTH
} # CONFIG_AV1_ENCODER
#
@@ -900,9 +1005,9 @@
$aom_idct32x32_135_add_msa=aom_idct32x32_1024_add_msa;
add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct32x32_34_add sse2 neon_asm dspr2 msa/, "$ssse3_x86_64";
+ specialize qw/aom_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
# Need to add 34 eob idct32x32 neon implementation.
- $aom_idct32x32_34_add_neon_asm=aom_idct32x32_1024_add_neon;
+ $aom_idct32x32_34_add_neon=aom_idct32x32_1024_add_neon;
add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct32x32_1_add sse2 neon dspr2 msa/;
@@ -948,25 +1053,26 @@
} # CONFIG_AOM_HIGHBITDEPTH
} # CONFIG_AV1_ENCODER
} # CONFIG_AOM_QM
+if (aom_config("CONFIG_AV1") eq "yes") {
+ #
+ # Alpha blending with mask
+ #
+ add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
+ add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
+ add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
+ specialize "aom_blend_a64_mask", qw/sse4_1/;
+ specialize "aom_blend_a64_hmask", qw/sse4_1/;
+ specialize "aom_blend_a64_vmask", qw/sse4_1/;
-#
-# Alpha blending with mask
-#
-add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
-add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
-add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
-specialize "aom_blend_a64_mask", qw/sse4_1/;
-specialize "aom_blend_a64_hmask", qw/sse4_1/;
-specialize "aom_blend_a64_vmask", qw/sse4_1/;
-
-if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
- add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
- add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
- add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
- specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
- specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
- specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
-}
+ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
+ add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
+ add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
+ specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
+ specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
+ specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
+ }
+} # CONFIG_AV1
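#
# Illustrative note (not part of this patch): the a64 blend helpers combine two
# predictors with a per-pixel 6-bit alpha in [0, 64]. A minimal C sketch of the
# per-pixel blend rule, assuming round-to-nearest behaviour; the helper name is
# illustrative:
#
#   static uint8_t blend_a64(int m, uint8_t v0, uint8_t v1) {
#     /* m = 64 selects v0, m = 0 selects v1. */
#     return (uint8_t)((m * v0 + (64 - m) * v1 + 32) >> 6);
#   }
#
# aom_blend_a64_hmask/vmask take a one-dimensional mask (varying across
# columns and rows, respectively); aom_blend_a64_mask takes a full 2-D mask
# with optional chroma subsampling via subx/suby.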
if (aom_config("CONFIG_ENCODERS") eq "yes") {
#
@@ -975,47 +1081,140 @@
add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
specialize qw/aom_subtract_block neon msa sse2/;
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
#
-# Single block SAD
+# Sum of Squares
#
-add_proto qw/unsigned int aom_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad64x64 avx2 neon msa sse2/;
+add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
+specialize qw/aom_sum_squares_2d_i16 sse2/;
-add_proto qw/unsigned int aom_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad64x32 avx2 msa sse2/;
+add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
+specialize qw/aom_sum_squares_i16 sse2/;
+}
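#
# Illustrative note (not part of this patch): a minimal C sketch of what the
# 2-D sum-of-squares helper computes over a size x size region of int16
# residuals; the helper name is illustrative:
#
#   static uint64_t sum_squares_2d_i16(const int16_t *src, int stride, int size) {
#     uint64_t ss = 0;
#     for (int r = 0; r < size; ++r)
#       for (int c = 0; c < size; ++c)
#         ss += (int64_t)src[r * stride + c] * src[r * stride + c];
#     return ss;
#   }
#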
-add_proto qw/unsigned int aom_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad32x64 avx2 msa sse2/;
-add_proto qw/unsigned int aom_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad32x32 avx2 neon msa sse2/;
+#
+# Avg
+#
+if ((aom_config("CONFIG_AV1_ENCODER") eq "yes")) {
+ #
+ # Avg
+ #
+ add_proto qw/unsigned int aom_avg_8x8/, "const uint8_t *, int p";
+ specialize qw/aom_avg_8x8 sse2 neon msa/;
+ add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p";
+ specialize qw/aom_avg_4x4 sse2 neon msa/;
+ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/unsigned int aom_highbd_avg_8x8/, "const uint8_t *, int p";
+ specialize qw/aom_highbd_avg_8x8/;
+ add_proto qw/unsigned int aom_highbd_avg_4x4/, "const uint8_t *, int p";
+ specialize qw/aom_highbd_avg_4x4/;
+ add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+ specialize qw/aom_highbd_subtract_block sse2/;
+ }
-add_proto qw/unsigned int aom_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad32x16 avx2 msa sse2/;
+ #
+ # Minmax
+ #
+ add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+ specialize qw/aom_minmax_8x8 sse2 neon/;
+ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+ specialize qw/aom_highbd_minmax_8x8/;
+ }
-add_proto qw/unsigned int aom_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad16x32 msa sse2/;
+ add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
+ specialize qw/aom_hadamard_8x8 sse2 neon/, "$ssse3_x86_64";
-add_proto qw/unsigned int aom_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad16x16 media neon msa sse2/;
+ add_proto qw/void aom_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
+ specialize qw/aom_hadamard_16x16 sse2 neon/;
-add_proto qw/unsigned int aom_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad16x8 neon msa sse2/;
+ add_proto qw/int aom_satd/, "const int16_t *coeff, int length";
+ specialize qw/aom_satd sse2 neon/;
-add_proto qw/unsigned int aom_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad8x16 neon msa sse2/;
+ add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
+ specialize qw/aom_int_pro_row sse2 neon/;
-add_proto qw/unsigned int aom_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad8x8 neon msa sse2/;
+ add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, const int width";
+ specialize qw/aom_int_pro_col sse2 neon/;
-add_proto qw/unsigned int aom_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad8x4 msa sse2/;
+ add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
+ specialize qw/aom_vector_var neon sse2/;
+} # CONFIG_AV1_ENCODER
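#
# Illustrative note (not part of this patch): aom_satd is typically applied to
# the output of aom_hadamard_8x8/16x16 and simply accumulates absolute
# coefficient values. A minimal C sketch, assuming abs() from <stdlib.h>; the
# helper name is illustrative:
#
#   static int satd(const int16_t *coeff, int length) {
#     int total = 0;
#     for (int i = 0; i < length; ++i) total += abs(coeff[i]);
#     return total;
#   }
#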
-add_proto qw/unsigned int aom_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad4x8 msa sse2/;
+#
+# Single block SAD / Single block Avg SAD
+#
+foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+}
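#
# Illustrative note (not part of this patch): the loop above emits one SAD and
# one avg-SAD prototype per (w, h) pair in @block_sizes; the specialize calls
# below then attach whichever SIMD versions exist for each size. A minimal C
# sketch of the plain SAD kernel, assuming abs() from <stdlib.h>; the helper
# name is illustrative, not the actual aom_ symbol:
#
#   static unsigned int sad(const uint8_t *src, int src_stride,
#                           const uint8_t *ref, int ref_stride, int w, int h) {
#     unsigned int acc = 0;
#     for (int r = 0; r < h; ++r)
#       for (int c = 0; c < w; ++c)
#         acc += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
#     return acc;
#   }
#
#   /* The _avg variant first averages ref with second_pred (rounding up) and
#      then computes the same SAD against that combined prediction. */
#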
-add_proto qw/unsigned int aom_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/aom_sad4x4 neon msa sse2/;
+specialize qw/aom_sad128x128 sse2/;
+specialize qw/aom_sad128x64 sse2/;
+specialize qw/aom_sad64x128 sse2/;
+specialize qw/aom_sad64x64 avx2 neon msa sse2/;
+specialize qw/aom_sad64x32 avx2 msa sse2/;
+specialize qw/aom_sad32x64 avx2 msa sse2/;
+specialize qw/aom_sad32x32 avx2 neon msa sse2/;
+specialize qw/aom_sad32x16 avx2 msa sse2/;
+specialize qw/aom_sad16x32 msa sse2/;
+specialize qw/aom_sad16x16 media neon msa sse2/;
+specialize qw/aom_sad16x8 neon msa sse2/;
+specialize qw/aom_sad8x16 neon msa sse2/;
+specialize qw/aom_sad8x8 neon msa sse2/;
+specialize qw/aom_sad8x4 msa sse2/;
+specialize qw/aom_sad4x8 msa sse2/;
+specialize qw/aom_sad4x4 neon msa sse2/;
+
+specialize qw/aom_sad128x128_avg sse2/;
+specialize qw/aom_sad128x64_avg sse2/;
+specialize qw/aom_sad64x128_avg sse2/;
+specialize qw/aom_sad64x64_avg avx2 msa sse2/;
+specialize qw/aom_sad64x32_avg avx2 msa sse2/;
+specialize qw/aom_sad32x64_avg avx2 msa sse2/;
+specialize qw/aom_sad32x32_avg avx2 msa sse2/;
+specialize qw/aom_sad32x16_avg avx2 msa sse2/;
+specialize qw/aom_sad16x32_avg msa sse2/;
+specialize qw/aom_sad16x16_avg msa sse2/;
+specialize qw/aom_sad16x8_avg msa sse2/;
+specialize qw/aom_sad8x16_avg msa sse2/;
+specialize qw/aom_sad8x8_avg msa sse2/;
+specialize qw/aom_sad8x4_avg msa sse2/;
+specialize qw/aom_sad4x8_avg msa sse2/;
+specialize qw/aom_sad4x4_avg msa sse2/;
+
+if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ if ($w != 128 && $h != 128 && $w != 4) {
+ specialize "aom_highbd_sad${w}x${h}", qw/sse2/;
+ specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/;
+ }
+ }
+}
+
+#
+# Masked SAD
+#
+if (aom_config("CONFIG_EXT_INTER") eq "yes") {
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
+ specialize "aom_masked_sad${w}x${h}", qw/ssse3/;
+ }
+
+ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
+ specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3/;
+ }
+ }
+}
#
# OBMC SAD
@@ -1037,6 +1236,277 @@
}
#
+# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
+#
+# Blocks of 3
+foreach $s (@block_widths) {
+ add_proto qw/void/, "aom_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+}
+specialize qw/aom_sad64x64x3 msa/;
+specialize qw/aom_sad32x32x3 msa/;
+specialize qw/aom_sad16x16x3 sse3 ssse3 msa/;
+specialize qw/aom_sad8x8x3 sse3 msa/;
+specialize qw/aom_sad4x4x3 sse3 msa/;
+
+add_proto qw/void/, "aom_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/aom_sad16x8x3 sse3 ssse3 msa/;
+add_proto qw/void/, "aom_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/aom_sad8x16x3 sse3 msa/;
+
+# Blocks of 8
+foreach $s (@block_widths) {
+ add_proto qw/void/, "aom_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+}
+specialize qw/aom_sad64x64x8 msa/;
+specialize qw/aom_sad32x32x8 msa/;
+specialize qw/aom_sad16x16x8 sse4_1 msa/;
+specialize qw/aom_sad8x8x8 sse4_1 msa/;
+specialize qw/aom_sad4x4x8 sse4_1 msa/;
+
+add_proto qw/void/, "aom_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/aom_sad16x8x8 sse4_1 msa/;
+add_proto qw/void/, "aom_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/aom_sad8x16x8 sse4_1 msa/;
+add_proto qw/void/, "aom_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/aom_sad8x4x8 msa/;
+add_proto qw/void/, "aom_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/aom_sad4x8x8 msa/;
+
+if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ foreach $s (@block_widths) {
+ # Blocks of 3
+ add_proto qw/void/, "aom_highbd_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ # Blocks of 8
+ add_proto qw/void/, "aom_highbd_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ }
+ # Blocks of 3
+ add_proto qw/void/, "aom_highbd_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ add_proto qw/void/, "aom_highbd_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ # Blocks of 8
+ add_proto qw/void/, "aom_highbd_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ add_proto qw/void/, "aom_highbd_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ add_proto qw/void/, "aom_highbd_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ add_proto qw/void/, "aom_highbd_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+}
+
+#
+# Multi-block SAD, comparing a reference to N independent blocks
+#
+foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+}
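#
# Illustrative note (not part of this patch): the x4d kernels evaluate the same
# source block against four independent reference pointers in one call, which
# lets SIMD versions reuse the source loads. A minimal C sketch in terms of a
# scalar sad() helper like the one sketched earlier; names are illustrative:
#
#   unsigned int sad(const uint8_t *src, int src_stride,
#                    const uint8_t *ref, int ref_stride, int w, int h);
#
#   static void sad_x4d(const uint8_t *src, int src_stride,
#                       const uint8_t *const ref[4], int ref_stride,
#                       int w, int h, uint32_t *sad_array) {
#     for (int i = 0; i < 4; ++i)
#       sad_array[i] = sad(src, src_stride, ref[i], ref_stride, w, h);
#   }
#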
+
+specialize qw/aom_sad128x128x4d sse2/;
+specialize qw/aom_sad128x64x4d sse2/;
+specialize qw/aom_sad64x128x4d sse2/;
+specialize qw/aom_sad64x64x4d avx2 neon msa sse2/;
+specialize qw/aom_sad64x32x4d msa sse2/;
+specialize qw/aom_sad32x64x4d msa sse2/;
+specialize qw/aom_sad32x32x4d avx2 neon msa sse2/;
+specialize qw/aom_sad32x16x4d msa sse2/;
+specialize qw/aom_sad16x32x4d msa sse2/;
+specialize qw/aom_sad16x16x4d neon msa sse2/;
+specialize qw/aom_sad16x8x4d msa sse2/;
+specialize qw/aom_sad8x16x4d msa sse2/;
+specialize qw/aom_sad8x8x4d msa sse2/;
+specialize qw/aom_sad8x4x4d msa sse2/;
+specialize qw/aom_sad4x8x4d msa sse2/;
+specialize qw/aom_sad4x4x4d msa sse2/;
+
+if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ #
+ # Multi-block SAD, comparing a reference to N independent blocks
+ #
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ if ($w != 128 && $h != 128) {
+ specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
+ }
+ }
+}
+
+#
+# Structured Similarity (SSIM)
+#
+if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") {
+ add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64";
+
+ add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64";
+
+ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ }
+}
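#
# Illustrative note (not part of this patch): the ssim_parms helpers only
# gather the running sums that the SSIM formula needs for one window; the SSIM
# value itself is computed from them by the caller. A minimal C sketch of the
# accumulation for an 8x8 window; the helper name is illustrative:
#
#   static void ssim_parms_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp,
#                              uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
#                              uint32_t *sum_sq_r, uint32_t *sum_sxr) {
#     for (int i = 0; i < 8; ++i)
#       for (int j = 0; j < 8; ++j) {
#         *sum_s += s[i * sp + j];
#         *sum_r += r[i * rp + j];
#         *sum_sq_s += s[i * sp + j] * s[i * sp + j];
#         *sum_sq_r += r[i * rp + j] * r[i * rp + j];
#         *sum_sxr += s[i * sp + j] * r[i * rp + j];
#       }
#   }
#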
+} # CONFIG_ENCODERS
+
+if (aom_config("CONFIG_ENCODERS") eq "yes") {
+
+#
+# Specialty Variance
+#
+add_proto qw/void aom_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+specialize qw/aom_get16x16var sse2 avx2 neon msa/;
+specialize qw/aom_get8x8var sse2 neon msa/;
+
+
+add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+
+specialize qw/aom_mse16x16 sse2 avx2 media neon msa/;
+specialize qw/aom_mse16x8 sse2 msa/;
+specialize qw/aom_mse8x16 sse2 msa/;
+specialize qw/aom_mse8x8 sse2 msa/;
+
+if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ foreach $bd (8, 10, 12) {
+ add_proto qw/void/, "aom_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ add_proto qw/void/, "aom_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+
+ specialize "aom_highbd_${bd}_mse16x16", qw/sse2/;
+ specialize "aom_highbd_${bd}_mse8x8", qw/sse2/;
+ }
+}
+
+#
+# ...
+#
+add_proto qw/void aom_upsampled_pred/, "uint8_t *comp_pred, int width, int height, const uint8_t *ref, int ref_stride";
+specialize qw/aom_upsampled_pred sse2/;
+add_proto qw/void aom_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+specialize qw/aom_comp_avg_upsampled_pred sse2/;
+
+if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_upsampled_pred/, "uint16_t *comp_pred, int width, int height, const uint8_t *ref8, int ref_stride";
+ specialize qw/aom_highbd_upsampled_pred sse2/;
+ add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+ specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
+}
+
+#
+# ...
+#
+add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
+add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
+
+specialize qw/aom_get_mb_ss sse2 msa/;
+specialize qw/aom_get4x4sse_cs neon msa/;
+
+#
+# Variance / Subpixel Variance / Subpixel Avg Variance
+#
+foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+}
+
+specialize qw/aom_variance64x64 sse2 avx2 neon msa/;
+specialize qw/aom_variance64x32 sse2 avx2 neon msa/;
+specialize qw/aom_variance32x64 sse2 neon msa/;
+specialize qw/aom_variance32x32 sse2 avx2 neon msa/;
+specialize qw/aom_variance32x16 sse2 avx2 msa/;
+specialize qw/aom_variance16x32 sse2 msa/;
+specialize qw/aom_variance16x16 sse2 avx2 media neon msa/;
+specialize qw/aom_variance16x8 sse2 neon msa/;
+specialize qw/aom_variance8x16 sse2 neon msa/;
+specialize qw/aom_variance8x8 sse2 media neon msa/;
+specialize qw/aom_variance8x4 sse2 msa/;
+specialize qw/aom_variance4x8 sse2 msa/;
+specialize qw/aom_variance4x4 sse2 msa/;
+
+specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance64x32 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance32x64 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance32x16 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance16x32 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance16x16 media neon msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance16x8 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance8x16 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance8x8 media neon msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance8x4 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance4x8 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/;
+
+specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
+specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
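#
# Illustrative note (not part of this patch): variance is derived from the
# per-block sum and sum of squared differences, var = sse - (sum * sum) / (w * h);
# the sub_pixel variants first build a bilinear-filtered prediction from the
# xoffset/yoffset phase and then apply the same formula. A minimal C sketch of
# the plain kernel; the helper name is illustrative:
#
#   static unsigned int variance(const uint8_t *src, int src_stride,
#                                const uint8_t *ref, int ref_stride,
#                                int w, int h, unsigned int *sse) {
#     int64_t sum = 0;
#     uint64_t ssq = 0;
#     for (int r = 0; r < h; ++r)
#       for (int c = 0; c < w; ++c) {
#         const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
#         sum += d;
#         ssq += (int64_t)d * d;
#       }
#     *sse = (unsigned int)ssq;
#     return (unsigned int)(ssq - (uint64_t)((sum * sum) / (w * h)));
#   }
#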
+if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ foreach $bd (8, 10, 12) {
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
+ specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2";
+ }
+ if ($w == 4 && $h == 4) {
+ specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1";
+ }
+ if ($w != 128 && $h != 128 && $w != 4) {
+ specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/;
+ }
+ if ($w == 4 && $h == 4) {
+ specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
+ }
+ }
+ }
+} # CONFIG_AOM_HIGHBITDEPTH
+
+if (aom_config("CONFIG_EXT_INTER") eq "yes") {
+#
+# Masked Variance / Masked Subpixel Variance
+#
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
+ specialize "aom_masked_variance${w}x${h}", qw/ssse3/;
+ specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
+ }
+
+ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ foreach $bd ("_", "_10_", "_12_") {
+ foreach (@block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd${bd}masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
+ specialize "aom_highbd${bd}masked_variance${w}x${h}", qw/ssse3/;
+ specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
+ }
+ }
+ }
+}
+
+#
# OBMC Variance / OBMC Subpixel Variance
#
if (aom_config("CONFIG_MOTION_VAR") eq "yes") {
@@ -1061,513 +1531,6 @@
}
}
-#
-# Avg
-#
-if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
- add_proto qw/unsigned int aom_avg_8x8/, "const uint8_t *, int p";
- specialize qw/aom_avg_8x8 sse2 neon msa/;
-
- add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p";
- specialize qw/aom_avg_4x4 sse2 neon msa/;
-
- add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
- specialize qw/aom_minmax_8x8 sse2/;
-
- add_proto qw/void aom_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
- specialize qw/aom_hadamard_8x8 sse2/, "$ssse3_x86_64";
-
- add_proto qw/void aom_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
- specialize qw/aom_hadamard_16x16 sse2/;
-
- add_proto qw/int aom_satd/, "const int16_t *coeff, int length";
- specialize qw/aom_satd sse2 neon/;
-
- add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
- specialize qw/aom_int_pro_row sse2 neon/;
-
- add_proto qw/int16_t aom_int_pro_col/, "uint8_t const *ref, const int width";
- specialize qw/aom_int_pro_col sse2 neon/;
-
- add_proto qw/int aom_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
- specialize qw/aom_vector_var neon sse2/;
-} # CONFIG_AV1_ENCODER
-
-add_proto qw/unsigned int aom_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad64x64_avg avx2 msa sse2/;
-
-add_proto qw/unsigned int aom_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad64x32_avg avx2 msa sse2/;
-
-add_proto qw/unsigned int aom_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad32x64_avg avx2 msa sse2/;
-
-add_proto qw/unsigned int aom_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad32x32_avg avx2 msa sse2/;
-
-add_proto qw/unsigned int aom_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad32x16_avg avx2 msa sse2/;
-
-add_proto qw/unsigned int aom_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad16x32_avg msa sse2/;
-
-add_proto qw/unsigned int aom_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad16x16_avg msa sse2/;
-
-add_proto qw/unsigned int aom_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad16x8_avg msa sse2/;
-
-add_proto qw/unsigned int aom_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad8x16_avg msa sse2/;
-
-add_proto qw/unsigned int aom_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad8x8_avg msa sse2/;
-
-add_proto qw/unsigned int aom_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad8x4_avg msa sse2/;
-
-add_proto qw/unsigned int aom_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad4x8_avg msa sse2/;
-
-add_proto qw/unsigned int aom_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/aom_sad4x4_avg msa sse2/;
-
-#
-# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
-#
-# Blocks of 3
-add_proto qw/void aom_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad64x64x3 msa/;
-
-add_proto qw/void aom_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad32x32x3 msa/;
-
-add_proto qw/void aom_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad16x16x3 sse3 ssse3 msa/;
-
-add_proto qw/void aom_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad16x8x3 sse3 ssse3 msa/;
-
-add_proto qw/void aom_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad8x16x3 sse3 msa/;
-
-add_proto qw/void aom_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad8x8x3 sse3 msa/;
-
-add_proto qw/void aom_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad4x4x3 sse3 msa/;
-
-# Blocks of 8
-add_proto qw/void aom_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad64x64x8 msa/;
-
-add_proto qw/void aom_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad32x32x8 msa/;
-
-add_proto qw/void aom_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad16x16x8 sse4_1 msa/;
-
-add_proto qw/void aom_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad16x8x8 sse4_1 msa/;
-
-add_proto qw/void aom_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad8x16x8 sse4_1 msa/;
-
-add_proto qw/void aom_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad8x8x8 sse4_1 msa/;
-
-add_proto qw/void aom_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad8x4x8 msa/;
-
-add_proto qw/void aom_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad4x8x8 msa/;
-
-add_proto qw/void aom_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad4x4x8 sse4_1 msa/;
-
-#
-# Multi-block SAD, comparing a reference to N independent blocks
-#
-add_proto qw/void aom_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad64x64x4d avx2 neon msa sse2/;
-
-add_proto qw/void aom_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad64x32x4d msa sse2/;
-
-add_proto qw/void aom_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad32x64x4d msa sse2/;
-
-add_proto qw/void aom_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad32x32x4d avx2 neon msa sse2/;
-
-add_proto qw/void aom_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad32x16x4d msa sse2/;
-
-add_proto qw/void aom_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad16x32x4d msa sse2/;
-
-add_proto qw/void aom_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad16x16x4d neon msa sse2/;
-
-add_proto qw/void aom_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad16x8x4d msa sse2/;
-
-add_proto qw/void aom_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad8x16x4d msa sse2/;
-
-add_proto qw/void aom_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad8x8x4d msa sse2/;
-
-add_proto qw/void aom_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad8x4x4d msa sse2/;
-
-add_proto qw/void aom_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad4x8x4d msa sse2/;
-
-add_proto qw/void aom_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/aom_sad4x4x4d msa sse2/;
-
-#
-# Structured Similarity (SSIM)
-#
-if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") {
- add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
- specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64";
-
- add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
- specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64";
-}
-
-if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
- #
- # Block subtraction
- #
- add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
- specialize qw/aom_highbd_subtract_block/;
-
- #
- # Single block SAD
- #
- add_proto qw/unsigned int aom_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/aom_highbd_sad64x64 sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/aom_highbd_sad64x32 sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/aom_highbd_sad32x64 sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/aom_highbd_sad32x32 sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/aom_highbd_sad32x16 sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/aom_highbd_sad16x32 sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/aom_highbd_sad16x16 sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/aom_highbd_sad16x8 sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/aom_highbd_sad8x16 sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/aom_highbd_sad8x8 sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/aom_highbd_sad8x4 sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/aom_highbd_sad4x8/;
-
- add_proto qw/unsigned int aom_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/aom_highbd_sad4x4/;
-
- #
- # Avg
- #
- add_proto qw/unsigned int aom_highbd_avg_8x8/, "const uint8_t *, int p";
- specialize qw/aom_highbd_avg_8x8/;
- add_proto qw/unsigned int aom_highbd_avg_4x4/, "const uint8_t *, int p";
- specialize qw/aom_highbd_avg_4x4/;
- add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
- specialize qw/aom_highbd_minmax_8x8/;
-
- add_proto qw/unsigned int aom_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/aom_highbd_sad64x64_avg sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/aom_highbd_sad64x32_avg sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/aom_highbd_sad32x64_avg sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/aom_highbd_sad32x32_avg sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/aom_highbd_sad32x16_avg sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/aom_highbd_sad16x32_avg sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/aom_highbd_sad16x16_avg sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/aom_highbd_sad16x8_avg sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/aom_highbd_sad8x16_avg sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/aom_highbd_sad8x8_avg sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/aom_highbd_sad8x4_avg sse2/;
-
- add_proto qw/unsigned int aom_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/aom_highbd_sad4x8_avg/;
-
- add_proto qw/unsigned int aom_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/aom_highbd_sad4x4_avg/;
-
- #
- # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
- #
- # Blocks of 3
- add_proto qw/void aom_highbd_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad64x64x3/;
-
- add_proto qw/void aom_highbd_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad32x32x3/;
-
- add_proto qw/void aom_highbd_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad16x16x3/;
-
- add_proto qw/void aom_highbd_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad16x8x3/;
-
- add_proto qw/void aom_highbd_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad8x16x3/;
-
- add_proto qw/void aom_highbd_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad8x8x3/;
-
- add_proto qw/void aom_highbd_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad4x4x3/;
-
- # Blocks of 8
- add_proto qw/void aom_highbd_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad64x64x8/;
-
- add_proto qw/void aom_highbd_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad32x32x8/;
-
- add_proto qw/void aom_highbd_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad16x16x8/;
-
- add_proto qw/void aom_highbd_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad16x8x8/;
-
- add_proto qw/void aom_highbd_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad8x16x8/;
-
- add_proto qw/void aom_highbd_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad8x8x8/;
-
- add_proto qw/void aom_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad8x4x8/;
-
- add_proto qw/void aom_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad4x8x8/;
-
- add_proto qw/void aom_highbd_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad4x4x8/;
-
- #
- # Multi-block SAD, comparing a reference to N independent blocks
- #
- add_proto qw/void aom_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad64x64x4d sse2/;
-
- add_proto qw/void aom_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad64x32x4d sse2/;
-
- add_proto qw/void aom_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad32x64x4d sse2/;
-
- add_proto qw/void aom_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad32x32x4d sse2/;
-
- add_proto qw/void aom_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad32x16x4d sse2/;
-
- add_proto qw/void aom_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad16x32x4d sse2/;
-
- add_proto qw/void aom_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad16x16x4d sse2/;
-
- add_proto qw/void aom_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad16x8x4d sse2/;
-
- add_proto qw/void aom_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad8x16x4d sse2/;
-
- add_proto qw/void aom_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad8x8x4d sse2/;
-
- add_proto qw/void aom_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad8x4x4d sse2/;
-
- add_proto qw/void aom_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad4x8x4d sse2/;
-
- add_proto qw/void aom_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
- specialize qw/aom_highbd_sad4x4x4d sse2/;
-
- #
- # Structured Similarity (SSIM)
- #
- if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") {
- add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
- specialize qw/aom_highbd_ssim_parms_8x8/;
- }
-} # CONFIG_AOM_HIGHBITDEPTH
-} # CONFIG_ENCODERS
-
-if (aom_config("CONFIG_ENCODERS") eq "yes") {
-
-#
-# Variance
-#
-add_proto qw/unsigned int aom_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_variance64x64 sse2 avx2 neon msa/;
-
-add_proto qw/unsigned int aom_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_variance64x32 sse2 avx2 neon msa/;
-
-add_proto qw/unsigned int aom_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_variance32x64 sse2 neon msa/;
-
-add_proto qw/unsigned int aom_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_variance32x32 sse2 avx2 neon msa/;
-
-add_proto qw/unsigned int aom_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_variance32x16 sse2 avx2 msa/;
-
-add_proto qw/unsigned int aom_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_variance16x32 sse2 msa/;
-
-add_proto qw/unsigned int aom_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_variance16x16 sse2 avx2 media neon msa/;
-
-add_proto qw/unsigned int aom_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_variance16x8 sse2 neon msa/;
-
-add_proto qw/unsigned int aom_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_variance8x16 sse2 neon msa/;
-
-add_proto qw/unsigned int aom_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_variance8x8 sse2 media neon msa/;
-
-add_proto qw/unsigned int aom_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_variance8x4 sse2 msa/;
-
-add_proto qw/unsigned int aom_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_variance4x8 sse2 msa/;
-
-add_proto qw/unsigned int aom_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_variance4x4 sse2 msa/;
-
-#
-# Specialty Variance
-#
-add_proto qw/void aom_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/aom_get16x16var sse2 avx2 neon msa/;
-
-add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/aom_get8x8var sse2 neon msa/;
-
-add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_mse16x16 sse2 avx2 media neon msa/;
-
-add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_mse16x8 sse2 msa/;
-
-add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_mse8x16 sse2 msa/;
-
-add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_mse8x8 sse2 msa/;
-
-add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
- specialize qw/aom_get_mb_ss sse2 msa/;
-
-add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
- specialize qw/aom_get4x4sse_cs neon msa/;
-
-add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
-
-add_proto qw/void aom_upsampled_pred/, "uint8_t *pred, int width, int height, const uint8_t *ref, const int ref_stride";
-specialize qw/aom_upsampled_pred sse2/;
-add_proto qw/void aom_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, const int ref_stride";
-specialize qw/aom_comp_avg_upsampled_pred sse2/;
-
-if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
- add_proto qw/void aom_highbd_upsampled_pred/, "uint16_t *pred, int width, int height, const uint8_t *ref8, const int ref_stride";
- specialize qw/aom_highbd_upsampled_pred sse2/;
- add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, const int ref_stride";
- specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
-}
-
-#
-# Subpixel Variance
-#
-add_proto qw/uint32_t aom_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/;
-
-add_proto qw/uint32_t aom_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_sub_pixel_variance64x32 msa sse2 ssse3/;
-
-add_proto qw/uint32_t aom_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_sub_pixel_variance32x64 msa sse2 ssse3/;
-
-add_proto qw/uint32_t aom_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/;
-
-add_proto qw/uint32_t aom_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_sub_pixel_variance32x16 msa sse2 ssse3/;
-
-add_proto qw/uint32_t aom_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_sub_pixel_variance16x32 msa sse2 ssse3/;
-
-add_proto qw/uint32_t aom_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_sub_pixel_variance16x16 media neon msa sse2 ssse3/;
-
-add_proto qw/uint32_t aom_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_sub_pixel_variance16x8 msa sse2 ssse3/;
-
-add_proto qw/uint32_t aom_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_sub_pixel_variance8x16 msa sse2 ssse3/;
-
-add_proto qw/uint32_t aom_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_sub_pixel_variance8x8 media neon msa sse2 ssse3/;
-
-add_proto qw/uint32_t aom_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_sub_pixel_variance8x4 msa sse2 ssse3/;
-
-add_proto qw/uint32_t aom_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_sub_pixel_variance4x8 msa sse2 ssse3/;
-
-add_proto qw/uint32_t aom_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/;
-
add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
@@ -1619,6 +1582,10 @@
add_proto qw/uint32_t aom_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/aom_variance_halfpixvar16x16_hv sse2 media/;
+#
+# Comp Avg
+#
+add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/aom_highbd_12_variance64x64 sse2/;
@@ -1977,6 +1944,7 @@
add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
} # CONFIG_AOM_HIGHBITDEPTH
+
} # CONFIG_ENCODERS
1;
diff --git a/aom_dsp/aom_filter.h b/aom_dsp/aom_filter.h
index 1fc1cd6..04d113d 100644
--- a/aom_dsp/aom_filter.h
+++ b/aom_dsp/aom_filter.h
@@ -27,6 +27,15 @@
typedef int16_t InterpKernel[SUBPEL_TAPS];
+#define BIL_SUBPEL_BITS 3
+#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)
+
+// 2 tap bilinear filters
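+// Each pair of taps sums to 128; the 8 entries cover the 8 sub-pel
+// positions (1/8-pel steps) selected by BIL_SUBPEL_BITS.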
+static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index 2f28105..d9ef4f6 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -198,3 +198,57 @@
return s - ((t * t) >> shift_factor);
}
}
+
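+// Compute the minimum and maximum absolute difference between the two 8x8
+// blocks a and b (NEON version of aom_minmax_8x8_c).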
+void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int *min, int *max) {
+ // Load and concatenate.
+ const uint8x16_t a01 = vcombine_u8(vld1_u8(a), vld1_u8(a + a_stride));
+ const uint8x16_t a23 =
+ vcombine_u8(vld1_u8(a + 2 * a_stride), vld1_u8(a + 3 * a_stride));
+ const uint8x16_t a45 =
+ vcombine_u8(vld1_u8(a + 4 * a_stride), vld1_u8(a + 5 * a_stride));
+ const uint8x16_t a67 =
+ vcombine_u8(vld1_u8(a + 6 * a_stride), vld1_u8(a + 7 * a_stride));
+
+ const uint8x16_t b01 = vcombine_u8(vld1_u8(b), vld1_u8(b + b_stride));
+ const uint8x16_t b23 =
+ vcombine_u8(vld1_u8(b + 2 * b_stride), vld1_u8(b + 3 * b_stride));
+ const uint8x16_t b45 =
+ vcombine_u8(vld1_u8(b + 4 * b_stride), vld1_u8(b + 5 * b_stride));
+ const uint8x16_t b67 =
+ vcombine_u8(vld1_u8(b + 6 * b_stride), vld1_u8(b + 7 * b_stride));
+
+ // Absolute difference.
+ const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
+ const uint8x16_t ab23_diff = vabdq_u8(a23, b23);
+ const uint8x16_t ab45_diff = vabdq_u8(a45, b45);
+ const uint8x16_t ab67_diff = vabdq_u8(a67, b67);
+
+ // Max values between the Q vectors.
+ const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff);
+ const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff);
+ const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff);
+ const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff);
+
+ const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
+ const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
+
+ // Split to D and start doing pairwise.
+ uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max));
+ uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min));
+
+ // Enough runs of vpmax/min propagate the max/min values to every position.
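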
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ *min = *max = 0; // Clear high bits
+ // Store directly to avoid costly neon->gpr transfer.
+ vst1_lane_u8((uint8_t *)max, ab_max, 0);
+ vst1_lane_u8((uint8_t *)min, ab_min, 0);
+}
diff --git a/aom_dsp/arm/hadamard_neon.c b/aom_dsp/arm/hadamard_neon.c
new file mode 100644
index 0000000..af955f0
--- /dev/null
+++ b/aom_dsp/arm/hadamard_neon.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./aom_dsp_rtcd.h"
+
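+// One 1-D pass of the 8-point Hadamard transform (three butterfly stages)
+// over the eight rows held in a0..a7; it is applied before and after the
+// transpose to form the 2-D transform.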
+static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+ int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6, int16x8_t *a7) {
+ const int16x8_t b0 = vaddq_s16(*a0, *a1);
+ const int16x8_t b1 = vsubq_s16(*a0, *a1);
+ const int16x8_t b2 = vaddq_s16(*a2, *a3);
+ const int16x8_t b3 = vsubq_s16(*a2, *a3);
+ const int16x8_t b4 = vaddq_s16(*a4, *a5);
+ const int16x8_t b5 = vsubq_s16(*a4, *a5);
+ const int16x8_t b6 = vaddq_s16(*a6, *a7);
+ const int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+ const int16x8_t c0 = vaddq_s16(b0, b2);
+ const int16x8_t c1 = vaddq_s16(b1, b3);
+ const int16x8_t c2 = vsubq_s16(b0, b2);
+ const int16x8_t c3 = vsubq_s16(b1, b3);
+ const int16x8_t c4 = vaddq_s16(b4, b6);
+ const int16x8_t c5 = vaddq_s16(b5, b7);
+ const int16x8_t c6 = vsubq_s16(b4, b6);
+ const int16x8_t c7 = vsubq_s16(b5, b7);
+
+ *a0 = vaddq_s16(c0, c4);
+ *a1 = vsubq_s16(c2, c6);
+ *a2 = vsubq_s16(c0, c4);
+ *a3 = vaddq_s16(c2, c6);
+ *a4 = vaddq_s16(c3, c7);
+ *a5 = vsubq_s16(c3, c7);
+ *a6 = vsubq_s16(c1, c5);
+ *a7 = vaddq_s16(c1, c5);
+}
+
+// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider
+// reversing transpose order which may make it easier for the compiler to
+// reconcile the vtrn.64 moves.
+static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+ int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6, int16x8_t *a7) {
+ // Swap 64 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 08 09 10 11 12 13 14 15
+ // a2: 16 17 18 19 20 21 22 23
+ // a3: 24 25 26 27 28 29 30 31
+ // a4: 32 33 34 35 36 37 38 39
+ // a5: 40 41 42 43 44 45 46 47
+ // a6: 48 49 50 51 52 53 54 55
+ // a7: 56 57 58 59 60 61 62 63
+ // to:
+ // a04_lo: 00 01 02 03 32 33 34 35
+ // a15_lo: 08 09 10 11 40 41 42 43
+ // a26_lo: 16 17 18 19 48 49 50 51
+ // a37_lo: 24 25 26 27 56 57 58 59
+ // a04_hi: 04 05 06 07 36 37 38 39
+ // a15_hi: 12 13 14 15 44 45 46 47
+ // a26_hi: 20 21 22 23 52 53 54 55
+ // a37_hi: 28 29 30 31 60 61 62 63
+ const int16x8_t a04_lo = vcombine_s16(vget_low_s16(*a0), vget_low_s16(*a4));
+ const int16x8_t a15_lo = vcombine_s16(vget_low_s16(*a1), vget_low_s16(*a5));
+ const int16x8_t a26_lo = vcombine_s16(vget_low_s16(*a2), vget_low_s16(*a6));
+ const int16x8_t a37_lo = vcombine_s16(vget_low_s16(*a3), vget_low_s16(*a7));
+ const int16x8_t a04_hi = vcombine_s16(vget_high_s16(*a0), vget_high_s16(*a4));
+ const int16x8_t a15_hi = vcombine_s16(vget_high_s16(*a1), vget_high_s16(*a5));
+ const int16x8_t a26_hi = vcombine_s16(vget_high_s16(*a2), vget_high_s16(*a6));
+ const int16x8_t a37_hi = vcombine_s16(vget_high_s16(*a3), vget_high_s16(*a7));
+
+ // Swap 32 bit elements resulting in:
+ // a0246_lo:
+ // 00 01 16 17 32 33 48 49
+ // 02 03 18 19 34 35 50 51
+ // a1357_lo:
+ // 08 09 24 25 40 41 56 57
+ // 10 11 26 27 42 43 58 59
+ // a0246_hi:
+ // 04 05 20 21 36 37 52 53
+ // 06 07 22 23 38 39 54 55
+ // a1357_hi:
+ // 12 13 28 29 44 45 60 61
+ // 14 15 30 31 46 47 62 63
+ const int32x4x2_t a0246_lo =
+ vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo));
+ const int32x4x2_t a1357_lo =
+ vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo));
+ const int32x4x2_t a0246_hi =
+ vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi));
+ const int32x4x2_t a1357_hi =
+ vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi));
+
+ // Swap 16 bit elements resulting in:
+ // b0:
+ // 00 08 16 24 32 40 48 56
+ // 01 09 17 25 33 41 49 57
+ // b1:
+ // 02 10 18 26 34 42 50 58
+ // 03 11 19 27 35 43 51 59
+ // b2:
+ // 04 12 20 28 36 44 52 60
+ // 05 13 21 29 37 45 53 61
+ // b3:
+ // 06 14 22 30 38 46 54 62
+ // 07 15 23 31 39 47 55 63
+ const int16x8x2_t b0 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[0]),
+ vreinterpretq_s16_s32(a1357_lo.val[0]));
+ const int16x8x2_t b1 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[1]),
+ vreinterpretq_s16_s32(a1357_lo.val[1]));
+ const int16x8x2_t b2 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[0]),
+ vreinterpretq_s16_s32(a1357_hi.val[0]));
+ const int16x8x2_t b3 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[1]),
+ vreinterpretq_s16_s32(a1357_hi.val[1]));
+
+ *a0 = b0.val[0];
+ *a1 = b0.val[1];
+ *a2 = b1.val[0];
+ *a3 = b1.val[1];
+ *a4 = b2.val[0];
+ *a5 = b2.val[1];
+ *a6 = b3.val[0];
+ *a7 = b3.val[1];
+}
+
+void aom_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
+ int16_t *coeff) {
+ int16x8_t a0 = vld1q_s16(src_diff);
+ int16x8_t a1 = vld1q_s16(src_diff + src_stride);
+ int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
+ int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
+ int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
+ int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
+ int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
+ int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
+
+ hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ transpose8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ // Skip the second transpose because it is not required.
+
+ vst1q_s16(coeff + 0, a0);
+ vst1q_s16(coeff + 8, a1);
+ vst1q_s16(coeff + 16, a2);
+ vst1q_s16(coeff + 24, a3);
+ vst1q_s16(coeff + 32, a4);
+ vst1q_s16(coeff + 40, a5);
+ vst1q_s16(coeff + 48, a6);
+ vst1q_s16(coeff + 56, a7);
+}
+
+void aom_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
+ int16_t *coeff) {
+ int i;
+
+ /* Rearrange 16x16 to 8x32 and remove stride.
+ * Top left first. */
+ aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+ /* Top right. */
+ aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
+ /* Bottom left. */
+ aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
+ /* Bottom right. */
+ aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+
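+ /* Combine the four 8x8 transforms into the 16x16 result. The halving
+  * adds/subtracts (vhaddq/vhsubq) keep the values within int16_t range. */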
+ for (i = 0; i < 64; i += 8) {
+ const int16x8_t a0 = vld1q_s16(coeff + 0);
+ const int16x8_t a1 = vld1q_s16(coeff + 64);
+ const int16x8_t a2 = vld1q_s16(coeff + 128);
+ const int16x8_t a3 = vld1q_s16(coeff + 192);
+
+ const int16x8_t b0 = vhaddq_s16(a0, a1);
+ const int16x8_t b1 = vhsubq_s16(a0, a1);
+ const int16x8_t b2 = vhaddq_s16(a2, a3);
+ const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+ const int16x8_t c0 = vaddq_s16(b0, b2);
+ const int16x8_t c1 = vaddq_s16(b1, b3);
+ const int16x8_t c2 = vsubq_s16(b0, b2);
+ const int16x8_t c3 = vsubq_s16(b1, b3);
+
+ vst1q_s16(coeff + 0, c0);
+ vst1q_s16(coeff + 64, c1);
+ vst1q_s16(coeff + 128, c2);
+ vst1q_s16(coeff + 192, c3);
+
+ coeff += 8;
+ }
+}
diff --git a/aom_dsp/avg.c b/aom_dsp/avg.c
index 18b5474..9add3d5 100644
--- a/aom_dsp/avg.c
+++ b/aom_dsp/avg.c
@@ -13,24 +13,24 @@
#include "./aom_dsp_rtcd.h"
#include "aom_ports/mem.h"
-unsigned int aom_avg_8x8_c(const uint8_t *s, int p) {
+unsigned int aom_avg_8x8_c(const uint8_t *src, int stride) {
int i, j;
int sum = 0;
- for (i = 0; i < 8; ++i, s += p)
- for (j = 0; j < 8; sum += s[j], ++j) {
+ for (i = 0; i < 8; ++i, src += stride)
+ for (j = 0; j < 8; sum += src[j], ++j) {
}
- return (sum + 32) >> 6;
+ return ROUND_POWER_OF_TWO(sum, 6);
}
-unsigned int aom_avg_4x4_c(const uint8_t *s, int p) {
+unsigned int aom_avg_4x4_c(const uint8_t *src, int stride) {
int i, j;
int sum = 0;
- for (i = 0; i < 4; ++i, s += p)
- for (j = 0; j < 4; sum += s[j], ++j) {
+ for (i = 0; i < 4; ++i, src += stride)
+ for (j = 0; j < 4; sum += src[j], ++j) {
}
- return (sum + 8) >> 4;
+ return ROUND_POWER_OF_TWO(sum, 4);
}
// src_diff: first pass, 9 bit, dynamic range [-255, 255]
@@ -65,7 +65,9 @@
coeff[5] = c3 - c7;
}
-void aom_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
+// The order of the output coefficients (coeff) of the Hadamard transform is
+// not important. For optimization purposes the final transpose may be skipped.
+void aom_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
int16_t *coeff) {
int idx;
int16_t buffer[64];
@@ -88,12 +90,12 @@
}
// In place 16x16 2D Hadamard transform
-void aom_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
+void aom_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
int16_t *coeff) {
int idx;
for (idx = 0; idx < 4; ++idx) {
// src_diff: 9 bit, dynamic range [-255, 255]
- int16_t const *src_ptr =
+ const int16_t *src_ptr =
src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
aom_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
}
@@ -132,7 +134,7 @@
// Integer projection onto row vectors.
// height: value range {16, 32, 64}.
-void aom_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref,
+void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
const int ref_stride, const int height) {
int idx;
const int norm_factor = height >> 1;
@@ -148,7 +150,7 @@
}
// width: value range {16, 32, 64}.
-int16_t aom_int_pro_col_c(uint8_t const *ref, const int width) {
+int16_t aom_int_pro_col_c(const uint8_t *ref, const int width) {
int idx;
int16_t sum = 0;
// sum: 14 bit, dynamic range [0, 16320]
@@ -159,7 +161,7 @@
// ref: [0 - 510]
// src: [0 - 510]
// bwl: {2, 3, 4}
-int aom_vector_var_c(int16_t const *ref, int16_t const *src, const int bwl) {
+int aom_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) {
int i;
int width = 4 << bwl;
int sse = 0, mean = 0, var;
@@ -175,14 +177,14 @@
return var;
}
-void aom_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
- int *min, int *max) {
+void aom_minmax_8x8_c(const uint8_t *src, int src_stride, const uint8_t *ref,
+ int ref_stride, int *min, int *max) {
int i, j;
*min = 255;
*max = 0;
- for (i = 0; i < 8; ++i, s += p, d += dp) {
+ for (i = 0; i < 8; ++i, src += src_stride, ref += ref_stride) {
for (j = 0; j < 8; ++j) {
- int diff = abs(s[j] - d[j]);
+ int diff = abs(src[j] - ref[j]);
*min = diff < *min ? diff : *min;
*max = diff > *max ? diff : *max;
}
@@ -190,26 +192,26 @@
}
#if CONFIG_AOM_HIGHBITDEPTH
-unsigned int aom_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+unsigned int aom_highbd_avg_8x8_c(const uint8_t *src, int stride) {
int i, j;
int sum = 0;
- const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
- for (i = 0; i < 8; ++i, s += p)
+ const uint16_t *s = CONVERT_TO_SHORTPTR(src);
+ for (i = 0; i < 8; ++i, s += stride)
for (j = 0; j < 8; sum += s[j], ++j) {
}
- return (sum + 32) >> 6;
+ return ROUND_POWER_OF_TWO(sum, 6);
}
-unsigned int aom_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+unsigned int aom_highbd_avg_4x4_c(const uint8_t *src, int stride) {
int i, j;
int sum = 0;
- const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
- for (i = 0; i < 4; ++i, s += p)
+ const uint16_t *s = CONVERT_TO_SHORTPTR(src);
+ for (i = 0; i < 4; ++i, s += stride)
for (j = 0; j < 4; sum += s[j], ++j) {
}
- return (sum + 8) >> 4;
+ return ROUND_POWER_OF_TWO(sum, 4);
}
void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
diff --git a/aom_dsp/bitreader.h b/aom_dsp/bitreader.h
index 65cf900..478945b 100644
--- a/aom_dsp/bitreader.h
+++ b/aom_dsp/bitreader.h
@@ -12,21 +12,14 @@
#ifndef AOM_DSP_BITREADER_H_
#define AOM_DSP_BITREADER_H_
-#include <stddef.h>
+#include <assert.h>
#include <limits.h>
#include "./aom_config.h"
-
#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL."
#endif
-#if CONFIG_BITSTREAM_DEBUG
-#include <assert.h>
-#include <stdio.h>
-#include "aom_util/debug_util.h"
-#endif // CONFIG_BITSTREAM_DEBUG
-
#include "aom/aomdx.h"
#include "aom/aom_integer.h"
#if CONFIG_ANS
@@ -163,28 +156,6 @@
#if CONFIG_ACCOUNTING
if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
#endif
-
-#if CONFIG_BITSTREAM_DEBUG
- {
- int ref_bit, ref_prob;
- const int queue_r = bitstream_queue_get_read();
- const int frame_idx = bitstream_queue_get_frame_read();
- bitstream_queue_pop(&ref_bit, &ref_prob);
- if (prob != ref_prob) {
- fprintf(
- stderr,
- "\n *** prob error, frame_idx_r %d prob %d ref_prob %d queue_r %d\n",
- frame_idx, prob, ref_prob, queue_r);
- assert(0);
- }
- if (ret != ref_bit) {
- fprintf(stderr, "\n *** bit error, frame_idx_r %d bit %d ref_bit %d\n",
- frame_idx, ret, ref_bit);
- assert(0);
- }
- }
-#endif // CONFIG_BITSTREAM_DEBUG
-
return ret;
}
@@ -240,7 +211,7 @@
static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
int nsymbs ACCT_STR_PARAM) {
int ret;
-#if CONFIG_ANS
+#if CONFIG_RANS
(void)nsymbs;
ret = rans_read(r, cdf);
#elif CONFIG_DAALA_EC
diff --git a/aom_dsp/bitwriter.h b/aom_dsp/bitwriter.h
index 33fbd0b..b437669 100644
--- a/aom_dsp/bitwriter.h
+++ b/aom_dsp/bitwriter.h
@@ -14,16 +14,10 @@
#include <assert.h>
#include "./aom_config.h"
-
#if CONFIG_EC_ADAPT && !CONFIG_EC_MULTISYMBOL
#error "CONFIG_EC_ADAPT is enabled without enabling CONFIG_EC_MULTISYMBOL"
#endif
-#if CONFIG_BITSTREAM_DEBUG
-#include <stdio.h>
-#include "aom_util/debug_util.h"
-#endif // CONFIG_BITSTREAM_DEBUG
-
#if CONFIG_ANS
#include "aom_dsp/buf_ans.h"
#elif CONFIG_DAALA_EC
@@ -32,6 +26,11 @@
#include "aom_dsp/dkboolwriter.h"
#endif
#include "aom_dsp/prob.h"
+
+#if CONFIG_RD_DEBUG
+#include "av1/encoder/cost.h"
+#endif
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -44,6 +43,8 @@
typedef struct aom_dk_writer aom_writer;
#endif
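+
+// Accumulates the coding cost of symbols written through the *_record()
+// writer wrappers below (only updated when CONFIG_RD_DEBUG is enabled).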
+typedef struct TOKEN_STATS { int64_t cost; } TOKEN_STATS;
+
static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) {
#if CONFIG_ANS
(void)bc;
@@ -75,24 +76,27 @@
#else
aom_dk_write(br, bit, probability);
#endif
+}
-#if CONFIG_BITSTREAM_DEBUG
- // int queue_r = 0;
- // int frame_idx_r = 0;
- // int queue_w = bitstream_queue_get_write();
- // int frame_idx_w = bitstream_queue_get_frame_write();
- // if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
- // fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
- // frame_idx_w, queue_w);
- // }
- bitstream_queue_push(bit, probability);
-#endif // CONFIG_BITSTREAM_DEBUG
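+
+// Same as aom_write(), but also adds the cost of the coded bit to
+// token_stats when CONFIG_RD_DEBUG is enabled.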
+static INLINE void aom_write_record(aom_writer *br, int bit, int probability,
+ TOKEN_STATS *token_stats) {
+ aom_write(br, bit, probability);
+#if CONFIG_RD_DEBUG
+ token_stats->cost += av1_cost_bit(probability, bit);
+#else
+ (void)token_stats;
+#endif
}
static INLINE void aom_write_bit(aom_writer *w, int bit) {
aom_write(w, bit, 128); // aom_prob_half
}
+static INLINE void aom_write_bit_record(aom_writer *w, int bit,
+ TOKEN_STATS *token_stats) {
+ aom_write_record(w, bit, 128, token_stats); // aom_prob_half
+}
+
static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
int bit;
@@ -109,6 +113,18 @@
} while (len);
}
+static INLINE void aom_write_tree_bits_record(aom_writer *w,
+ const aom_tree_index *tr,
+ const aom_prob *probs, int bits,
+ int len, aom_tree_index i,
+ TOKEN_STATS *token_stats) {
+ do {
+ const int bit = (bits >> --len) & 1;
+ aom_write_record(w, bit, probs[i >> 1], token_stats);
+ i = tr[i + bit];
+ } while (len);
+}
+
static INLINE void aom_write_tree(aom_writer *w, const aom_tree_index *tree,
const aom_prob *probs, int bits, int len,
aom_tree_index i) {
@@ -119,10 +135,23 @@
#endif
}
+static INLINE void aom_write_tree_record(aom_writer *w,
+ const aom_tree_index *tree,
+ const aom_prob *probs, int bits,
+ int len, aom_tree_index i,
+ TOKEN_STATS *token_stats) {
+#if CONFIG_DAALA_EC
+ (void)token_stats;
+ daala_write_tree_bits(w, tree, probs, bits, len, i);
+#else
+ aom_write_tree_bits_record(w, tree, probs, bits, len, i, token_stats);
+#endif
+}
+
#if CONFIG_EC_MULTISYMBOL
static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
int nsymbs) {
-#if CONFIG_ANS
+#if CONFIG_RANS
struct rans_sym s;
(void)nsymbs;
assert(cdf);
diff --git a/aom_dsp/deblock.c b/aom_dsp/deblock.c
new file mode 100644
index 0000000..ec53891
--- /dev/null
+++ b/aom_dsp/deblock.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ */
+
+#include <stdlib.h>
+#include "aom/aom_integer.h"
+
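+// Pseudo-random values used by aom_mbpost_proc_down_c() to dither the
+// filtered output.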
+const int16_t aom_rv[] = {
+ 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3, 14,
+ 4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0,
+ 3, 14, 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8,
+ 2, 9, 7, 3, 3, 1, 13, 13, 6, 6, 5, 2, 7, 11, 9, 11, 8, 7, 3,
+ 2, 0, 13, 13, 14, 4, 12, 5, 12, 10, 8, 10, 13, 10, 4, 14, 4, 10, 0,
+ 8, 11, 1, 13, 7, 7, 14, 6, 14, 13, 2, 13, 5, 4, 4, 0, 10, 0, 5,
+ 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, 3, 4, 7,
+ 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, 13, 1,
+ 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, 9,
+ 6, 10, 11, 7, 8, 7, 5, 14, 8, 4, 4, 0, 8, 7, 10, 0, 8, 14, 11,
+ 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, 11, 12, 12, 8, 0, 11, 13, 1, 2,
+ 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0, 3, 10, 5, 8, 0, 11, 6,
+ 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, 4, 3, 5, 6, 10, 8, 9,
+ 4, 11, 14, 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, 8, 0, 4, 10, 7, 2,
+ 7, 2, 2, 5, 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, 3, 14, 3, 6, 3,
+ 0, 11, 8, 13, 1, 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, 5, 2, 13, 7,
+ 1, 13, 14, 7, 6, 7, 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, 4, 4, 0,
+ 8, 7, 10, 0, 8, 14, 11, 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, 11, 12,
+ 12, 8, 0, 11, 13, 1, 2, 0, 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, 0,
+ 3, 10, 5, 8, 0, 11, 6, 7, 8, 10, 7, 13, 9, 2, 5, 1, 5, 10, 2,
+ 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, 3, 8, 3, 7, 8, 5, 11, 4, 12,
+ 3, 11, 9, 14, 8, 14, 13, 4, 3, 1, 2, 14, 6, 5, 4, 4, 11, 4, 6,
+ 2, 1, 5, 8, 8, 12, 13, 5, 14, 10, 12, 13, 0, 9, 5, 5, 11, 10, 13,
+ 9, 10, 13,
+};
+
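+// Conditionally low-pass filters each pixel, first down the columns and then
+// across the rows; a pixel is only smoothed when all four neighbours in the
+// filter direction differ from it by less than the per-column threshold
+// f[col].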
+void aom_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
+ unsigned char *dst_ptr,
+ int src_pixels_per_line,
+ int dst_pixels_per_line, int cols,
+ unsigned char *f, int size) {
+ unsigned char *p_src, *p_dst;
+ int row;
+ int col;
+ unsigned char v;
+ unsigned char d[4];
+
+ for (row = 0; row < size; row++) {
+ /* post_proc_down for one row */
+ p_src = src_ptr;
+ p_dst = dst_ptr;
+
+ for (col = 0; col < cols; col++) {
+ unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
+ unsigned char p_above1 = p_src[col - src_pixels_per_line];
+ unsigned char p_below1 = p_src[col + src_pixels_per_line];
+ unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
+
+ v = p_src[col];
+
+ if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) &&
+ (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {
+ unsigned char k1, k2, k3;
+ k1 = (p_above2 + p_above1 + 1) >> 1;
+ k2 = (p_below2 + p_below1 + 1) >> 1;
+ k3 = (k1 + k2 + 1) >> 1;
+ v = (k3 + v + 1) >> 1;
+ }
+
+ p_dst[col] = v;
+ }
+
+ /* now post_proc_across */
+ p_src = dst_ptr;
+ p_dst = dst_ptr;
+
+ p_src[-2] = p_src[-1] = p_src[0];
+ p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
+
+ for (col = 0; col < cols; col++) {
+ v = p_src[col];
+
+ if ((abs(v - p_src[col - 2]) < f[col]) &&
+ (abs(v - p_src[col - 1]) < f[col]) &&
+ (abs(v - p_src[col + 1]) < f[col]) &&
+ (abs(v - p_src[col + 2]) < f[col])) {
+ unsigned char k1, k2, k3;
+ k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
+ k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
+ k3 = (k1 + k2 + 1) >> 1;
+ v = (k3 + v + 1) >> 1;
+ }
+
+ d[col & 3] = v;
+
+ if (col >= 2) p_dst[col - 2] = d[(col - 2) & 3];
+ }
+
+ /* handle the last two pixels */
+ p_dst[col - 2] = d[(col - 2) & 3];
+ p_dst[col - 1] = d[(col - 1) & 3];
+
+ /* next row */
+ src_ptr += src_pixels_per_line;
+ dst_ptr += dst_pixels_per_line;
+ }
+}
+
+void aom_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows,
+ int cols, int flimit) {
+ int r, c, i;
+
+ unsigned char *s = src;
+ unsigned char d[16];
+
+ for (r = 0; r < rows; r++) {
+ int sumsq = 0;
+ int sum = 0;
+
+ for (i = -8; i < 0; i++) s[i] = s[0];
+
+ /* 17 avoids a valgrind warning - we buffer the s[c] values in d[]
+ * and only write them back once we have read 8 ahead...
+ */
+ for (i = 0; i < 17; i++) s[i + cols] = s[cols - 1];
+
+ for (i = -8; i <= 6; i++) {
+ sumsq += s[i] * s[i];
+ sum += s[i];
+ d[i + 8] = 0;
+ }
+
+ for (c = 0; c < cols + 8; c++) {
+ int x = s[c + 7] - s[c - 8];
+ int y = s[c + 7] + s[c - 8];
+
+ sum += x;
+ sumsq += x * y;
+
+ d[c & 15] = s[c];
+
+ if (sumsq * 15 - sum * sum < flimit) {
+ d[c & 15] = (8 + sum + s[c]) >> 4;
+ }
+
+ s[c - 8] = d[(c - 8) & 15];
+ }
+
+ s += pitch;
+ }
+}
+
+void aom_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,
+ int flimit) {
+ int r, c, i;
+ const int16_t *rv3 = &aom_rv[63 & rand()];
+
+ for (c = 0; c < cols; c++) {
+ unsigned char *s = &dst[c];
+ int sumsq = 0;
+ int sum = 0;
+ unsigned char d[16];
+ const int16_t *rv2 = rv3 + ((c * 17) & 127);
+
+ for (i = -8; i < 0; i++) s[i * pitch] = s[0];
+
+ /* 17 avoids a valgrind warning - we buffer the current column's values
+ * in d[] and only write them back once we have read 8 ahead...
+ */
+ for (i = 0; i < 17; i++) s[(i + rows) * pitch] = s[(rows - 1) * pitch];
+
+ for (i = -8; i <= 6; i++) {
+ sumsq += s[i * pitch] * s[i * pitch];
+ sum += s[i * pitch];
+ }
+
+ for (r = 0; r < rows + 8; r++) {
+ sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
+ sum += s[7 * pitch] - s[-8 * pitch];
+ d[r & 15] = s[0];
+
+ if (sumsq * 15 - sum * sum < flimit) {
+ d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
+ }
+ if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15];
+ s += pitch;
+ }
+ }
+}
diff --git a/aom_dsp/dkboolreader.c b/aom_dsp/dkboolreader.c
index 805ea29..4079d70 100644
--- a/aom_dsp/dkboolreader.c
+++ b/aom_dsp/dkboolreader.c
@@ -1,12 +1,11 @@
/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
@@ -75,7 +74,7 @@
buffer += (bits >> 3);
value = r->value | (nv << (shift & 0x7));
} else {
- const int bits_over = (int)(shift + CHAR_BIT - bits_left);
+ const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left);
int loop_end = 0;
if (bits_over >= 0) {
count += LOTS_OF_BITS;
diff --git a/aom_dsp/dkboolreader.h b/aom_dsp/dkboolreader.h
index e3661e2..add480a 100644
--- a/aom_dsp/dkboolreader.h
+++ b/aom_dsp/dkboolreader.h
@@ -17,6 +17,12 @@
#include <limits.h>
#include "./aom_config.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include <assert.h>
+#include <stdio.h>
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
#include "aom_ports/mem.h"
#include "aom/aomdx.h"
#include "aom/aom_integer.h"
@@ -143,6 +149,27 @@
r->count = count;
r->range = range;
+#if CONFIG_BITSTREAM_DEBUG
+ {
+ int ref_bit, ref_prob;
+ const int queue_r = bitstream_queue_get_read();
+ const int frame_idx = bitstream_queue_get_frame_read();
+ bitstream_queue_pop(&ref_bit, &ref_prob);
+ if (prob != ref_prob) {
+ fprintf(
+ stderr,
+ "\n *** prob error, frame_idx_r %d prob %d ref_prob %d queue_r %d\n",
+ frame_idx, prob, ref_prob, queue_r);
+ assert(0);
+ }
+ if ((int)bit != ref_bit) {
+ fprintf(stderr, "\n *** bit error, frame_idx_r %d bit %d ref_bit %d\n",
+ frame_idx, bit, ref_bit);
+ assert(0);
+ }
+ }
+#endif // CONFIG_BITSTREAM_DEBUG
+
return bit;
}
diff --git a/aom_dsp/dkboolwriter.c b/aom_dsp/dkboolwriter.c
index 2e44df2..fc98e7c 100644
--- a/aom_dsp/dkboolwriter.c
+++ b/aom_dsp/dkboolwriter.c
@@ -29,8 +29,16 @@
void aom_dk_stop_encode(aom_dk_writer *br) {
int i;
+#if CONFIG_BITSTREAM_DEBUG
+ bitstream_queue_set_skip_write(1);
+#endif // CONFIG_BITSTREAM_DEBUG
+
for (i = 0; i < 32; i++) aom_dk_write_bit(br, 0);
+#if CONFIG_BITSTREAM_DEBUG
+ bitstream_queue_set_skip_write(0);
+#endif // CONFIG_BITSTREAM_DEBUG
+
 // Ensure there's no ambiguous collision with any index marker bytes
if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0;
}
diff --git a/aom_dsp/dkboolwriter.h b/aom_dsp/dkboolwriter.h
index 256c3a5..8354368 100644
--- a/aom_dsp/dkboolwriter.h
+++ b/aom_dsp/dkboolwriter.h
@@ -12,9 +12,15 @@
#ifndef AOM_DSP_DKBOOLWRITER_H_
#define AOM_DSP_DKBOOLWRITER_H_
-#include "aom_ports/mem.h"
+#include "./aom_config.h"
+
+#if CONFIG_BITSTREAM_DEBUG
+#include <stdio.h>
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
#include "aom_dsp/prob.h"
+#include "aom_ports/mem.h"
#ifdef __cplusplus
extern "C" {
@@ -38,6 +44,18 @@
unsigned int lowvalue = br->lowvalue;
register int shift;
+#if CONFIG_BITSTREAM_DEBUG
+ // int queue_r = 0;
+ // int frame_idx_r = 0;
+ // int queue_w = bitstream_queue_get_write();
+ // int frame_idx_w = bitstream_queue_get_frame_write();
+ // if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
+ // fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
+ // frame_idx_w, queue_w);
+ // }
+ bitstream_queue_push(bit, probability);
+#endif // CONFIG_BITSTREAM_DEBUG
+
split = 1 + (((range - 1) * probability) >> 8);
range = split;
diff --git a/aom_dsp/fwd_txfm.c b/aom_dsp/fwd_txfm.c
index 12745ab..547919f 100644
--- a/aom_dsp/fwd_txfm.c
+++ b/aom_dsp/fwd_txfm.c
@@ -9,8 +9,9 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include <assert.h>
#include "aom_dsp/fwd_txfm.h"
+#include <assert.h>
+#include "./aom_dsp_rtcd.h"
void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
@@ -85,7 +86,6 @@
for (c = 0; c < 4; ++c) sum += input[r * stride + c];
output[0] = sum << 1;
- output[1] = 0;
}
void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
@@ -179,7 +179,6 @@
for (c = 0; c < 8; ++c) sum += input[r * stride + c];
output[0] = sum;
- output[1] = 0;
}
void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
@@ -364,12 +363,11 @@
void aom_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
- tran_low_t sum = 0;
+ int sum = 0;
for (r = 0; r < 16; ++r)
for (c = 0; c < 16; ++c) sum += input[r * stride + c];
- output[0] = sum >> 1;
- output[1] = 0;
+ output[0] = (tran_low_t)(sum >> 1);
}
static INLINE tran_high_t dct_32_round(tran_high_t input) {
@@ -762,12 +760,11 @@
void aom_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
- tran_low_t sum = 0;
+ int sum = 0;
for (r = 0; r < 32; ++r)
for (c = 0; c < 32; ++c) sum += input[r * stride + c];
- output[0] = sum >> 3;
- output[1] = 0;
+ output[0] = (tran_low_t)(sum >> 3);
}
#if CONFIG_AOM_HIGHBITDEPTH
diff --git a/aom_dsp/intrapred.c b/aom_dsp/intrapred.c
index e33f320..29b5a74 100644
--- a/aom_dsp/intrapred.c
+++ b/aom_dsp/intrapred.c
@@ -19,6 +19,30 @@
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
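+
+// D207 (207-degree directional) prediction uses only the left column: the
+// first two output columns are interpolated from `left`, and each remaining
+// sample is copied from the one one row below and two columns to its left.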
+static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r, c;
+ (void)above;
+ // first column
+ for (r = 0; r < bs - 1; ++r) dst[r * stride] = AVG2(left[r], left[r + 1]);
+ dst[(bs - 1) * stride] = left[bs - 1];
+ dst++;
+
+ // second column
+ for (r = 0; r < bs - 2; ++r)
+ dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]);
+ dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]);
+ dst[(bs - 1) * stride] = left[bs - 1];
+ dst++;
+
+ // rest of last row
+ for (c = 0; c < bs - 2; ++c) dst[(bs - 1) * stride + c] = left[bs - 1];
+
+ for (r = bs - 2; r >= 0; --r)
+ for (c = 0; c < bs - 2; ++c)
+ dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
+}
+
static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
@@ -34,6 +58,23 @@
}
}
+static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r, c;
+ int size;
+ (void)left;
+ for (c = 0; c < bs; ++c) {
+ dst[c] = AVG2(above[c], above[c + 1]);
+ dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]);
+ }
+ for (r = 2, size = bs - 2; r < bs; r += 2, --size) {
+ memcpy(dst + (r + 0) * stride, dst + (r >> 1), size);
+ memset(dst + (r + 0) * stride + size, above[bs - 1], bs - size);
+ memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1), size);
+ memset(dst + (r + 1) * stride + size, above[bs - 1], bs - size);
+ }
+}
+
static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
@@ -48,6 +89,25 @@
}
}
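+
+// D45 (45-degree directional) prediction is built from the row above: the
+// first row is a 3-tap average of `above`, and each following row shifts it
+// one further pixel to the left, padding the tail with above[bs - 1].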
+static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8_t above_right = above[bs - 1];
+ const uint8_t *const dst_row0 = dst;
+ int x, size;
+ (void)left;
+
+ for (x = 0; x < bs - 1; ++x) {
+ dst[x] = AVG3(above[x], above[x + 1], above[x + 2]);
+ }
+ dst[bs - 1] = above_right;
+ dst += stride;
+ for (x = 1, size = bs - 2; x < bs; ++x, --size) {
+ memcpy(dst, dst_row0 + x, size);
+ memset(dst + size, above_right, x + 1);
+ dst += stride;
+ }
+}
+
static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
@@ -88,18 +148,29 @@
static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
- int r, c;
- dst[0] = AVG3(left[0], above[-1], above[0]);
- for (c = 1; c < bs; c++) dst[c] = AVG3(above[c - 2], above[c - 1], above[c]);
+ int i;
+#if defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ > 7
+ // silence a spurious -Warray-bounds warning, possibly related to:
+ // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56273
+ uint8_t border[69];
+#else
+ uint8_t border[32 + 32 - 1]; // outer border from bottom-left to top-right
+#endif
- dst[stride] = AVG3(above[-1], left[0], left[1]);
- for (r = 2; r < bs; ++r)
- dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]);
+ // dst(bs, bs - 2)[0], i.e., border starting at bottom-left
+ for (i = 0; i < bs - 2; ++i) {
+ border[i] = AVG3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]);
+ }
+ border[bs - 2] = AVG3(above[-1], left[0], left[1]);
+ border[bs - 1] = AVG3(left[0], above[-1], above[0]);
+ border[bs - 0] = AVG3(above[-1], above[0], above[1]);
+ // dst[0][2, size), i.e., remaining top border ascending
+ for (i = 0; i < bs - 2; ++i) {
+ border[bs + 1 + i] = AVG3(above[i], above[i + 1], above[i + 2]);
+ }
- dst += stride;
- for (r = 1; r < bs; ++r) {
- for (c = 1; c < bs; c++) dst[c] = dst[-stride + c - 1];
- dst += stride;
+ for (i = 0; i < bs; ++i) {
+ memcpy(dst + i * stride, border + bs - 1 - i, bs);
}
}
@@ -148,6 +219,36 @@
}
}
+#if CONFIG_ALT_INTRA
+static INLINE int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; }
+
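+// Paeth prediction (as used in PNG filtering): return whichever of left, top
+// and top_left is closest to the gradient estimate base = top + left -
+// top_left.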
+static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top,
+ uint16_t top_left) {
+ const int base = top + left - top_left;
+ const int p_left = abs_diff(base, left);
+ const int p_top = abs_diff(base, top);
+ const int p_top_left = abs_diff(base, top_left);
+
+ // Return nearest to base of left, top and top_left.
+ return (p_left <= p_top && p_left <= p_top_left)
+ ? left
+ : (p_top <= p_top_left) ? top : top_left;
+}
+
+static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ int r, c;
+ const uint8_t ytop_left = above[-1];
+
+ for (r = 0; r < bs; r++) {
+ for (c = 0; c < bs; c++)
+ dst[c] = (uint8_t)paeth_predictor_single(left[r], above[c], ytop_left);
+ dst += stride;
+ }
+}
+
+#else
+
static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
int r, c;
@@ -159,6 +260,7 @@
dst += stride;
}
}
+#endif // CONFIG_ALT_INTRA
static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
@@ -411,13 +513,13 @@
DST(1, 0) = DST(0, 2) = AVG2(B, C);
DST(2, 0) = DST(1, 2) = AVG2(C, D);
DST(3, 0) = DST(2, 2) = AVG2(D, E);
- DST(3, 2) = AVG2(E, F); // differs from aom
+ DST(3, 2) = AVG2(E, F); // differs from vp8
DST(0, 1) = AVG3(A, B, C);
DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
- DST(3, 3) = AVG3(E, F, G); // differs from aom
+ DST(3, 3) = AVG3(E, F, G); // differs from vp8
}
void aom_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
@@ -462,7 +564,7 @@
DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
- DST(3, 3) = H; // differs from aom
+ DST(3, 3) = H; // differs from vp8
}
void aom_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
@@ -555,6 +657,37 @@
}
#if CONFIG_AOM_HIGHBITDEPTH
+static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ (void)above;
+ (void)bd;
+
+ // First column.
+ for (r = 0; r < bs - 1; ++r) {
+ dst[r * stride] = AVG2(left[r], left[r + 1]);
+ }
+ dst[(bs - 1) * stride] = left[bs - 1];
+ dst++;
+
+ // Second column.
+ for (r = 0; r < bs - 2; ++r) {
+ dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]);
+ }
+ dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]);
+ dst[(bs - 1) * stride] = left[bs - 1];
+ dst++;
+
+ // Rest of last row.
+ for (c = 0; c < bs - 2; ++c) dst[(bs - 1) * stride + c] = left[bs - 1];
+
+ for (r = bs - 2; r >= 0; --r) {
+ for (c = 0; c < bs - 2; ++c)
+ dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
+ }
+}
+
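For orientation, the high-bitdepth D207 fill above reduces, for bs = 4, to the layout below (L = left[], A2/A3 the two- and three-tap averages): columns 2 and 3 are copied from one row down and two columns to the left, and everything past the known left samples flattens to left[bs - 1].

/* bs = 4 layout implied by the loops above:
 *   row 0: A2(L0,L1)  A3(L0,L1,L2)  A2(L1,L2)  A3(L1,L2,L3)
 *   row 1: A2(L1,L2)  A3(L1,L2,L3)  A2(L2,L3)  A3(L2,L3,L3)
 *   row 2: A2(L2,L3)  A3(L2,L3,L3)  L3         L3
 *   row 3: L3          L3           L3         L3
 */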
static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
@@ -572,9 +705,9 @@
}
}
-static INLINE void highbd_d63e_predictor(uint16_t *dst, ptrdiff_t stride,
- int bs, const uint16_t *above,
- const uint16_t *left, int bd) {
+static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
int r, c;
(void)left;
(void)bd;
@@ -588,6 +721,24 @@
}
}
+#define highbd_d63e_predictor highbd_d63_predictor
+
+static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ (void)left;
+ (void)bd;
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ dst[c] = r + c + 2 < bs * 2
+ ? AVG3(above[r + c], above[r + c + 1], above[r + c + 2])
+ : above[bs * 2 - 1];
+ }
+ dst += stride;
+ }
+}
+
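highbd_d45 writes, for pixel (r, c), the three-tap average of above[r + c], above[r + c + 1] and above[r + c + 2], and pins the prediction to the last available top sample once r + c + 2 would run past the 2 * bs samples of above[]. The same rule for a single sample, as a hedged sketch (AVG3 assumed as elsewhere in this file):

#include <stdint.h>

#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) /* assumed definition */

/* One D45 output sample; above[] is assumed to hold 2 * bs valid samples. */
static uint16_t d45_sample(const uint16_t *above, int bs, int r, int c) {
  if (r + c + 2 < 2 * bs)
    return (uint16_t)AVG3(above[r + c], above[r + c + 1], above[r + c + 2]);
  return above[2 * bs - 1];  /* bottom-right region is flat */
}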
static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
const uint16_t *left, int bd) {
@@ -698,6 +849,22 @@
}
}
+#if CONFIG_ALT_INTRA
+static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ const uint16_t ytop_left = above[-1];
+ (void)bd;
+
+ for (r = 0; r < bs; r++) {
+ for (c = 0; c < bs; c++)
+ dst[c] = paeth_predictor_single(left[r], above[c], ytop_left);
+ dst += stride;
+ }
+}
+
+#else
static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
const uint16_t *above,
const uint16_t *left, int bd) {
@@ -711,6 +878,7 @@
dst += stride;
}
}
+#endif // CONFIG_ALT_INTRA
static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
int bs, const uint16_t *above,
@@ -796,41 +964,60 @@
highbd_##type##_predictor(dst, stride, size, above, left, bd); \
}
-#define intra_pred_allsizes(type) \
- intra_pred_sized(type, 2) intra_pred_sized(type, 4) intra_pred_sized( \
- type, 8) intra_pred_sized(type, 16) intra_pred_sized(type, 32) \
- intra_pred_highbd_sized(type, 4) intra_pred_highbd_sized(type, 8) \
- intra_pred_highbd_sized(type, 16) intra_pred_highbd_sized(type, 32)
+/* clang-format off */
+#define intra_pred_allsizes(type) \
+ intra_pred_sized(type, 2) \
+ intra_pred_sized(type, 4) \
+ intra_pred_sized(type, 8) \
+ intra_pred_sized(type, 16) \
+ intra_pred_sized(type, 32) \
+ intra_pred_highbd_sized(type, 4) \
+ intra_pred_highbd_sized(type, 8) \
+ intra_pred_highbd_sized(type, 16) \
+ intra_pred_highbd_sized(type, 32)
-#define intra_pred_above_4x4(type) \
- intra_pred_sized(type, 8) intra_pred_sized(type, 16) \
- intra_pred_sized(type, 32) intra_pred_highbd_sized(type, 4) \
- intra_pred_highbd_sized(type, 8) intra_pred_highbd_sized(type, 16) \
- intra_pred_highbd_sized(type, 32)
+#define intra_pred_above_4x4(type) \
+ intra_pred_sized(type, 8) \
+ intra_pred_sized(type, 16) \
+ intra_pred_sized(type, 32) \
+ intra_pred_highbd_sized(type, 4) \
+ intra_pred_highbd_sized(type, 8) \
+ intra_pred_highbd_sized(type, 16) \
+ intra_pred_highbd_sized(type, 32)
#else
-#define intra_pred_allsizes(type) \
- intra_pred_sized(type, 2) intra_pred_sized(type, 4) intra_pred_sized( \
- type, 8) intra_pred_sized(type, 16) intra_pred_sized(type, 32)
+#define intra_pred_allsizes(type) \
+ intra_pred_sized(type, 2) \
+ intra_pred_sized(type, 4) \
+ intra_pred_sized(type, 8) \
+ intra_pred_sized(type, 16) \
+ intra_pred_sized(type, 32)
-#define intra_pred_above_4x4(type) \
- intra_pred_sized(type, 8) intra_pred_sized(type, 16) \
- intra_pred_sized(type, 32)
+#define intra_pred_above_4x4(type) \
+ intra_pred_sized(type, 8) \
+ intra_pred_sized(type, 16) \
+ intra_pred_sized(type, 32)
#endif // CONFIG_AOM_HIGHBITDEPTH
-/* clang-format off */
-intra_pred_allsizes(d207e) // NOLINT
-intra_pred_allsizes(d63e) // NOLINT
-intra_pred_above_4x4(d45e) // NOLINT
-intra_pred_above_4x4(d117) // NOLINT
-intra_pred_above_4x4(d135) // NOLINT
-intra_pred_above_4x4(d153) // NOLINT
-intra_pred_allsizes(v) // NOLINT
-intra_pred_allsizes(h) // NOLINT
-intra_pred_allsizes(tm) // NOLINT
-intra_pred_allsizes(dc_128) // NOLINT
-intra_pred_allsizes(dc_left) // NOLINT
-intra_pred_allsizes(dc_top) // NOLINT
-intra_pred_allsizes(dc) // NOLINT
+intra_pred_above_4x4(d207)
+intra_pred_above_4x4(d63)
+intra_pred_above_4x4(d45)
+intra_pred_allsizes(d207e)
+intra_pred_allsizes(d63e)
+intra_pred_above_4x4(d45e)
+intra_pred_above_4x4(d117)
+intra_pred_above_4x4(d135)
+intra_pred_above_4x4(d153)
+intra_pred_allsizes(v)
+intra_pred_allsizes(h)
+#if CONFIG_ALT_INTRA
+intra_pred_allsizes(paeth)
+#else
+intra_pred_allsizes(tm)
+#endif // CONFIG_ALT_INTRA
+intra_pred_allsizes(dc_128)
+intra_pred_allsizes(dc_left)
+intra_pred_allsizes(dc_top)
+intra_pred_allsizes(dc)
+/* clang-format on */
#undef intra_pred_allsizes
- /* clang-format on */
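The macro reflow above is formatting only: each intra_pred_sized / intra_pred_highbd_sized line still stamps out one C predictor per (mode, size) pair, and the instantiation list now selects paeth in place of tm when CONFIG_ALT_INTRA is enabled. As a rough illustration (the real macro bodies are defined earlier in this file and are not part of this hunk), intra_pred_sized(dc, 8) plausibly expands to something like:

/* Hypothetical expansion, shown only to indicate what the list generates. */
void aom_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left) {
  dc_predictor(dst, stride, 8, above, left);
}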
diff --git a/aom_dsp/inv_txfm.c b/aom_dsp/inv_txfm.c
index 616ddfc..4bb656b 100644
--- a/aom_dsp/inv_txfm.c
+++ b/aom_dsp/inv_txfm.c
@@ -12,6 +12,7 @@
#include <math.h>
#include <string.h>
+#include "./aom_dsp_rtcd.h"
#include "aom_dsp/inv_txfm.h"
void aom_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -35,10 +36,10 @@
c1 = e1 - c1;
a1 -= b1;
d1 += c1;
- op[0] = WRAPLOW(a1, 8);
- op[1] = WRAPLOW(b1, 8);
- op[2] = WRAPLOW(c1, 8);
- op[3] = WRAPLOW(d1, 8);
+ op[0] = WRAPLOW(a1);
+ op[1] = WRAPLOW(b1);
+ op[2] = WRAPLOW(c1);
+ op[3] = WRAPLOW(d1);
ip += 4;
op += 4;
}
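Most of the churn in inv_txfm.c is mechanical: WRAPLOW loses its second argument (which was always 8 in these 8-bit functions), and the high-bitdepth functions later in the file switch to HIGHBD_WRAPLOW(v, bd) instead. The macros live in aom_dsp/inv_txfm.h and are not shown in this patch; the sketch below is an assumption about their shape, included only to make the renames readable. For bd == 8 the two forms agree, which is why the argument could be dropped here.

#include <stdint.h>

/* Assumed shapes, not the actual definitions (see aom_dsp/inv_txfm.h). */
#define WRAPLOW_OLD_SKETCH(x, bd) \
  ((((int32_t)(x)) << (24 - (bd))) >> (24 - (bd))) /* keep low bd + 8 bits */
#define WRAPLOW_NEW_SKETCH(x) ((int32_t)(int16_t)(x)) /* keep low 16 bits */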
@@ -56,10 +57,10 @@
c1 = e1 - c1;
a1 -= b1;
d1 += c1;
- dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
- dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
- dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
- dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);
+ dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
+ dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
+ dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
+ dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));
ip++;
dest++;
@@ -76,8 +77,8 @@
a1 = ip[0] >> UNIT_QUANT_SHIFT;
e1 = a1 >> 1;
a1 -= e1;
- op[0] = WRAPLOW(a1, 8);
- op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
+ op[0] = WRAPLOW(a1);
+ op[1] = op[2] = op[3] = WRAPLOW(e1);
ip = tmp;
for (i = 0; i < 4; i++) {
@@ -98,18 +99,18 @@
// stage 1
temp1 = (input[0] + input[2]) * cospi_16_64;
temp2 = (input[0] - input[2]) * cospi_16_64;
- step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step[1] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step[3] = WRAPLOW(dct_const_round_shift(temp2));
// stage 2
- output[0] = WRAPLOW(step[0] + step[3], 8);
- output[1] = WRAPLOW(step[1] + step[2], 8);
- output[2] = WRAPLOW(step[1] - step[2], 8);
- output[3] = WRAPLOW(step[0] - step[3], 8);
+ output[0] = WRAPLOW(step[0] + step[3]);
+ output[1] = WRAPLOW(step[1] + step[2]);
+ output[2] = WRAPLOW(step[1] - step[2]);
+ output[3] = WRAPLOW(step[0] - step[3]);
}
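aom_idct4_c is the standard 4-point inverse DCT butterfly: stage 1 rotates the even inputs by cospi_16_64 and the odd inputs by the cospi_24_64 / cospi_8_64 pair, and stage 2 recombines them with adds and subtracts. The cospi_N_64 constants are Q14 fixed-point cosines (approximately round(cos(N * pi / 64) * 2^14)), and dct_const_round_shift brings each product back down to coefficient precision. A floating-point reference of the same butterfly, for orientation only:

#include <math.h>

static void idct4_reference(const double *in, double *out) {
  const double kPi = 3.14159265358979323846;
  const double c16 = cos(16 * kPi / 64); /* ~ cospi_16_64 / 2^14 */
  const double c24 = cos(24 * kPi / 64); /* ~ cospi_24_64 / 2^14 */
  const double c8 = cos(8 * kPi / 64);   /* ~ cospi_8_64  / 2^14 */
  const double step0 = (in[0] + in[2]) * c16;
  const double step1 = (in[0] - in[2]) * c16;
  const double step2 = in[1] * c24 - in[3] * c8;
  const double step3 = in[1] * c8 + in[3] * c24;
  out[0] = step0 + step3;
  out[1] = step1 + step2;
  out[2] = step1 - step2;
  out[3] = step0 - step3;
}

The integer code above additionally wraps each intermediate with WRAPLOW, which is what the rest of this patch is renaming.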
void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -140,8 +141,8 @@
int dest_stride) {
int i;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 4);
for (i = 0; i < 4; i++) {
@@ -163,48 +164,48 @@
step1[3] = input[6];
temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
// stage 2
temp1 = (step1[0] + step1[2]) * cospi_16_64;
temp2 = (step1[0] - step1[2]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
- step2[4] = WRAPLOW(step1[4] + step1[5], 8);
- step2[5] = WRAPLOW(step1[4] - step1[5], 8);
- step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
- step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[4] = WRAPLOW(step1[4] + step1[5]);
+ step2[5] = WRAPLOW(step1[4] - step1[5]);
+ step2[6] = WRAPLOW(-step1[6] + step1[7]);
+ step2[7] = WRAPLOW(step1[6] + step1[7]);
// stage 3
- step1[0] = WRAPLOW(step2[0] + step2[3], 8);
- step1[1] = WRAPLOW(step2[1] + step2[2], 8);
- step1[2] = WRAPLOW(step2[1] - step2[2], 8);
- step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+ step1[0] = WRAPLOW(step2[0] + step2[3]);
+ step1[1] = WRAPLOW(step2[1] + step2[2]);
+ step1[2] = WRAPLOW(step2[1] - step2[2]);
+ step1[3] = WRAPLOW(step2[0] - step2[3]);
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
step1[7] = step2[7];
// stage 4
- output[0] = WRAPLOW(step1[0] + step1[7], 8);
- output[1] = WRAPLOW(step1[1] + step1[6], 8);
- output[2] = WRAPLOW(step1[2] + step1[5], 8);
- output[3] = WRAPLOW(step1[3] + step1[4], 8);
- output[4] = WRAPLOW(step1[3] - step1[4], 8);
- output[5] = WRAPLOW(step1[2] - step1[5], 8);
- output[6] = WRAPLOW(step1[1] - step1[6], 8);
- output[7] = WRAPLOW(step1[0] - step1[7], 8);
+ output[0] = WRAPLOW(step1[0] + step1[7]);
+ output[1] = WRAPLOW(step1[1] + step1[6]);
+ output[2] = WRAPLOW(step1[2] + step1[5]);
+ output[3] = WRAPLOW(step1[3] + step1[4]);
+ output[4] = WRAPLOW(step1[3] - step1[4]);
+ output[5] = WRAPLOW(step1[2] - step1[5]);
+ output[6] = WRAPLOW(step1[1] - step1[6]);
+ output[7] = WRAPLOW(step1[0] - step1[7]);
}
void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -234,8 +235,8 @@
void aom_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
@@ -263,7 +264,7 @@
s4 = sinpi_1_9 * x2;
s5 = sinpi_2_9 * x3;
s6 = sinpi_4_9 * x3;
- s7 = x0 - x2 + x3;
+ s7 = WRAPLOW(x0 - x2 + x3);
s0 = s0 + s3 + s5;
s1 = s1 - s4 - s6;
@@ -274,10 +275,10 @@
// The overall dynamic range is 14b (input) + 14b (multiplication scaling)
// + 1b (addition) = 29b.
// Hence the output bit depth is 15b.
- output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
- output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
- output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
- output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
+ output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
+ output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
+ output[2] = WRAPLOW(dct_const_round_shift(s2));
+ output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
}
void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
@@ -308,14 +309,14 @@
s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
- x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
- x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
- x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
- x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
- x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
- x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
- x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
- x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
+ x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
+ x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
+ x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
+ x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
+ x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
+ x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
+ x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
+ x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
// stage 2
s0 = (int)x0;
@@ -327,14 +328,14 @@
s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
- x0 = WRAPLOW(s0 + s2, 8);
- x1 = WRAPLOW(s1 + s3, 8);
- x2 = WRAPLOW(s0 - s2, 8);
- x3 = WRAPLOW(s1 - s3, 8);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
+ x0 = WRAPLOW(s0 + s2);
+ x1 = WRAPLOW(s1 + s3);
+ x2 = WRAPLOW(s0 - s2);
+ x3 = WRAPLOW(s1 - s3);
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+ x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+ x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
// stage 3
s2 = (int)(cospi_16_64 * (x2 + x3));
@@ -342,19 +343,19 @@
s6 = (int)(cospi_16_64 * (x6 + x7));
s7 = (int)(cospi_16_64 * (x6 - x7));
- x2 = WRAPLOW(dct_const_round_shift(s2), 8);
- x3 = WRAPLOW(dct_const_round_shift(s3), 8);
- x6 = WRAPLOW(dct_const_round_shift(s6), 8);
- x7 = WRAPLOW(dct_const_round_shift(s7), 8);
+ x2 = WRAPLOW(dct_const_round_shift(s2));
+ x3 = WRAPLOW(dct_const_round_shift(s3));
+ x6 = WRAPLOW(dct_const_round_shift(s6));
+ x7 = WRAPLOW(dct_const_round_shift(s7));
- output[0] = WRAPLOW(x0, 8);
- output[1] = WRAPLOW(-x4, 8);
- output[2] = WRAPLOW(x6, 8);
- output[3] = WRAPLOW(-x2, 8);
- output[4] = WRAPLOW(x3, 8);
- output[5] = WRAPLOW(-x7, 8);
- output[6] = WRAPLOW(x5, 8);
- output[7] = WRAPLOW(-x1, 8);
+ output[0] = WRAPLOW(x0);
+ output[1] = WRAPLOW(-x4);
+ output[2] = WRAPLOW(x6);
+ output[3] = WRAPLOW(-x2);
+ output[4] = WRAPLOW(x3);
+ output[5] = WRAPLOW(-x7);
+ output[6] = WRAPLOW(x5);
+ output[7] = WRAPLOW(-x1);
}
void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
@@ -416,23 +417,23 @@
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[15] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
// stage 3
step1[0] = step2[0];
@@ -442,109 +443,109 @@
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
- step1[8] = WRAPLOW(step2[8] + step2[9], 8);
- step1[9] = WRAPLOW(step2[8] - step2[9], 8);
- step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
- step1[11] = WRAPLOW(step2[10] + step2[11], 8);
- step1[12] = WRAPLOW(step2[12] + step2[13], 8);
- step1[13] = WRAPLOW(step2[12] - step2[13], 8);
- step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
- step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+ step1[8] = WRAPLOW(step2[8] + step2[9]);
+ step1[9] = WRAPLOW(step2[8] - step2[9]);
+ step1[10] = WRAPLOW(-step2[10] + step2[11]);
+ step1[11] = WRAPLOW(step2[10] + step2[11]);
+ step1[12] = WRAPLOW(step2[12] + step2[13]);
+ step1[13] = WRAPLOW(step2[12] - step2[13]);
+ step1[14] = WRAPLOW(-step2[14] + step2[15]);
+ step1[15] = WRAPLOW(step2[14] + step2[15]);
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
- step2[4] = WRAPLOW(step1[4] + step1[5], 8);
- step2[5] = WRAPLOW(step1[4] - step1[5], 8);
- step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
- step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[4] = WRAPLOW(step1[4] + step1[5]);
+ step2[5] = WRAPLOW(step1[4] - step1[5]);
+ step2[6] = WRAPLOW(-step1[6] + step1[7]);
+ step2[7] = WRAPLOW(step1[6] + step1[7]);
step2[8] = step1[8];
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
step2[11] = step1[11];
step2[12] = step1[12];
// stage 5
- step1[0] = WRAPLOW(step2[0] + step2[3], 8);
- step1[1] = WRAPLOW(step2[1] + step2[2], 8);
- step1[2] = WRAPLOW(step2[1] - step2[2], 8);
- step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+ step1[0] = WRAPLOW(step2[0] + step2[3]);
+ step1[1] = WRAPLOW(step2[1] + step2[2]);
+ step1[2] = WRAPLOW(step2[1] - step2[2]);
+ step1[3] = WRAPLOW(step2[0] - step2[3]);
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
step1[7] = step2[7];
- step1[8] = WRAPLOW(step2[8] + step2[11], 8);
- step1[9] = WRAPLOW(step2[9] + step2[10], 8);
- step1[10] = WRAPLOW(step2[9] - step2[10], 8);
- step1[11] = WRAPLOW(step2[8] - step2[11], 8);
- step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
- step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
- step1[14] = WRAPLOW(step2[13] + step2[14], 8);
- step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+ step1[8] = WRAPLOW(step2[8] + step2[11]);
+ step1[9] = WRAPLOW(step2[9] + step2[10]);
+ step1[10] = WRAPLOW(step2[9] - step2[10]);
+ step1[11] = WRAPLOW(step2[8] - step2[11]);
+ step1[12] = WRAPLOW(-step2[12] + step2[15]);
+ step1[13] = WRAPLOW(-step2[13] + step2[14]);
+ step1[14] = WRAPLOW(step2[13] + step2[14]);
+ step1[15] = WRAPLOW(step2[12] + step2[15]);
// stage 6
- step2[0] = WRAPLOW(step1[0] + step1[7], 8);
- step2[1] = WRAPLOW(step1[1] + step1[6], 8);
- step2[2] = WRAPLOW(step1[2] + step1[5], 8);
- step2[3] = WRAPLOW(step1[3] + step1[4], 8);
- step2[4] = WRAPLOW(step1[3] - step1[4], 8);
- step2[5] = WRAPLOW(step1[2] - step1[5], 8);
- step2[6] = WRAPLOW(step1[1] - step1[6], 8);
- step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+ step2[0] = WRAPLOW(step1[0] + step1[7]);
+ step2[1] = WRAPLOW(step1[1] + step1[6]);
+ step2[2] = WRAPLOW(step1[2] + step1[5]);
+ step2[3] = WRAPLOW(step1[3] + step1[4]);
+ step2[4] = WRAPLOW(step1[3] - step1[4]);
+ step2[5] = WRAPLOW(step1[2] - step1[5]);
+ step2[6] = WRAPLOW(step1[1] - step1[6]);
+ step2[7] = WRAPLOW(step1[0] - step1[7]);
step2[8] = step1[8];
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
step2[14] = step1[14];
step2[15] = step1[15];
// stage 7
- output[0] = WRAPLOW(step2[0] + step2[15], 8);
- output[1] = WRAPLOW(step2[1] + step2[14], 8);
- output[2] = WRAPLOW(step2[2] + step2[13], 8);
- output[3] = WRAPLOW(step2[3] + step2[12], 8);
- output[4] = WRAPLOW(step2[4] + step2[11], 8);
- output[5] = WRAPLOW(step2[5] + step2[10], 8);
- output[6] = WRAPLOW(step2[6] + step2[9], 8);
- output[7] = WRAPLOW(step2[7] + step2[8], 8);
- output[8] = WRAPLOW(step2[7] - step2[8], 8);
- output[9] = WRAPLOW(step2[6] - step2[9], 8);
- output[10] = WRAPLOW(step2[5] - step2[10], 8);
- output[11] = WRAPLOW(step2[4] - step2[11], 8);
- output[12] = WRAPLOW(step2[3] - step2[12], 8);
- output[13] = WRAPLOW(step2[2] - step2[13], 8);
- output[14] = WRAPLOW(step2[1] - step2[14], 8);
- output[15] = WRAPLOW(step2[0] - step2[15], 8);
+ output[0] = WRAPLOW(step2[0] + step2[15]);
+ output[1] = WRAPLOW(step2[1] + step2[14]);
+ output[2] = WRAPLOW(step2[2] + step2[13]);
+ output[3] = WRAPLOW(step2[3] + step2[12]);
+ output[4] = WRAPLOW(step2[4] + step2[11]);
+ output[5] = WRAPLOW(step2[5] + step2[10]);
+ output[6] = WRAPLOW(step2[6] + step2[9]);
+ output[7] = WRAPLOW(step2[7] + step2[8]);
+ output[8] = WRAPLOW(step2[7] - step2[8]);
+ output[9] = WRAPLOW(step2[6] - step2[9]);
+ output[10] = WRAPLOW(step2[5] - step2[10]);
+ output[11] = WRAPLOW(step2[4] - step2[11]);
+ output[12] = WRAPLOW(step2[3] - step2[12]);
+ output[13] = WRAPLOW(step2[2] - step2[13]);
+ output[14] = WRAPLOW(step2[1] - step2[14]);
+ output[15] = WRAPLOW(step2[0] - step2[15]);
}
void aom_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
@@ -619,22 +620,22 @@
s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
- x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
- x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
- x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
- x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
- x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
- x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
- x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
- x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
- x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
- x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
- x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
- x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
- x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
- x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
+ x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
+ x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
+ x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
+ x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
+ x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
+ x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
+ x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
+ x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
+ x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
+ x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
+ x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
+ x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
+ x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
// stage 2
s0 = x0;
@@ -654,22 +655,22 @@
s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
- x0 = WRAPLOW(s0 + s4, 8);
- x1 = WRAPLOW(s1 + s5, 8);
- x2 = WRAPLOW(s2 + s6, 8);
- x3 = WRAPLOW(s3 + s7, 8);
- x4 = WRAPLOW(s0 - s4, 8);
- x5 = WRAPLOW(s1 - s5, 8);
- x6 = WRAPLOW(s2 - s6, 8);
- x7 = WRAPLOW(s3 - s7, 8);
- x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
- x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
- x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
- x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
- x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
- x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
- x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
- x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
+ x0 = WRAPLOW(s0 + s4);
+ x1 = WRAPLOW(s1 + s5);
+ x2 = WRAPLOW(s2 + s6);
+ x3 = WRAPLOW(s3 + s7);
+ x4 = WRAPLOW(s0 - s4);
+ x5 = WRAPLOW(s1 - s5);
+ x6 = WRAPLOW(s2 - s6);
+ x7 = WRAPLOW(s3 - s7);
+ x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
+ x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
+ x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
+ x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
+ x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
+ x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
+ x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
// stage 3
s0 = x0;
@@ -689,22 +690,22 @@
s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
- x0 = WRAPLOW(check_range(s0 + s2), 8);
- x1 = WRAPLOW(check_range(s1 + s3), 8);
- x2 = WRAPLOW(check_range(s0 - s2), 8);
- x3 = WRAPLOW(check_range(s1 - s3), 8);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
- x8 = WRAPLOW(check_range(s8 + s10), 8);
- x9 = WRAPLOW(check_range(s9 + s11), 8);
- x10 = WRAPLOW(check_range(s8 - s10), 8);
- x11 = WRAPLOW(check_range(s9 - s11), 8);
- x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
- x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
- x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
- x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
+ x0 = WRAPLOW(s0 + s2);
+ x1 = WRAPLOW(s1 + s3);
+ x2 = WRAPLOW(s0 - s2);
+ x3 = WRAPLOW(s1 - s3);
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+ x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+ x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+ x8 = WRAPLOW(s8 + s10);
+ x9 = WRAPLOW(s9 + s11);
+ x10 = WRAPLOW(s8 - s10);
+ x11 = WRAPLOW(s9 - s11);
+ x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
+ x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
+ x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
// stage 4
s2 = (-cospi_16_64) * (x2 + x3);
@@ -716,31 +717,31 @@
s14 = (-cospi_16_64) * (x14 + x15);
s15 = cospi_16_64 * (x14 - x15);
- x2 = WRAPLOW(dct_const_round_shift(s2), 8);
- x3 = WRAPLOW(dct_const_round_shift(s3), 8);
- x6 = WRAPLOW(dct_const_round_shift(s6), 8);
- x7 = WRAPLOW(dct_const_round_shift(s7), 8);
- x10 = WRAPLOW(dct_const_round_shift(s10), 8);
- x11 = WRAPLOW(dct_const_round_shift(s11), 8);
- x14 = WRAPLOW(dct_const_round_shift(s14), 8);
- x15 = WRAPLOW(dct_const_round_shift(s15), 8);
+ x2 = WRAPLOW(dct_const_round_shift(s2));
+ x3 = WRAPLOW(dct_const_round_shift(s3));
+ x6 = WRAPLOW(dct_const_round_shift(s6));
+ x7 = WRAPLOW(dct_const_round_shift(s7));
+ x10 = WRAPLOW(dct_const_round_shift(s10));
+ x11 = WRAPLOW(dct_const_round_shift(s11));
+ x14 = WRAPLOW(dct_const_round_shift(s14));
+ x15 = WRAPLOW(dct_const_round_shift(s15));
- output[0] = WRAPLOW(x0, 8);
- output[1] = WRAPLOW(-x8, 8);
- output[2] = WRAPLOW(x12, 8);
- output[3] = WRAPLOW(-x4, 8);
- output[4] = WRAPLOW(x6, 8);
- output[5] = WRAPLOW(x14, 8);
- output[6] = WRAPLOW(x10, 8);
- output[7] = WRAPLOW(x2, 8);
- output[8] = WRAPLOW(x3, 8);
- output[9] = WRAPLOW(x11, 8);
- output[10] = WRAPLOW(x15, 8);
- output[11] = WRAPLOW(x7, 8);
- output[12] = WRAPLOW(x5, 8);
- output[13] = WRAPLOW(-x13, 8);
- output[14] = WRAPLOW(x9, 8);
- output[15] = WRAPLOW(-x1, 8);
+ output[0] = WRAPLOW(x0);
+ output[1] = WRAPLOW(-x8);
+ output[2] = WRAPLOW(x12);
+ output[3] = WRAPLOW(-x4);
+ output[4] = WRAPLOW(x6);
+ output[5] = WRAPLOW(x14);
+ output[6] = WRAPLOW(x10);
+ output[7] = WRAPLOW(x2);
+ output[8] = WRAPLOW(x3);
+ output[9] = WRAPLOW(x11);
+ output[10] = WRAPLOW(x15);
+ output[11] = WRAPLOW(x7);
+ output[12] = WRAPLOW(x5);
+ output[13] = WRAPLOW(-x13);
+ output[14] = WRAPLOW(x9);
+ output[15] = WRAPLOW(-x1);
}
void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
@@ -772,8 +773,8 @@
void aom_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) {
for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
@@ -805,43 +806,43 @@
temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
- step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[16] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[31] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[30] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[28] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[24] = WRAPLOW(dct_const_round_shift(temp2));
// stage 2
step2[0] = step1[0];
@@ -855,40 +856,40 @@
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[15] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
- step2[16] = WRAPLOW(step1[16] + step1[17], 8);
- step2[17] = WRAPLOW(step1[16] - step1[17], 8);
- step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
- step2[19] = WRAPLOW(step1[18] + step1[19], 8);
- step2[20] = WRAPLOW(step1[20] + step1[21], 8);
- step2[21] = WRAPLOW(step1[20] - step1[21], 8);
- step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
- step2[23] = WRAPLOW(step1[22] + step1[23], 8);
- step2[24] = WRAPLOW(step1[24] + step1[25], 8);
- step2[25] = WRAPLOW(step1[24] - step1[25], 8);
- step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
- step2[27] = WRAPLOW(step1[26] + step1[27], 8);
- step2[28] = WRAPLOW(step1[28] + step1[29], 8);
- step2[29] = WRAPLOW(step1[28] - step1[29], 8);
- step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
- step2[31] = WRAPLOW(step1[30] + step1[31], 8);
+ step2[16] = WRAPLOW(step1[16] + step1[17]);
+ step2[17] = WRAPLOW(step1[16] - step1[17]);
+ step2[18] = WRAPLOW(-step1[18] + step1[19]);
+ step2[19] = WRAPLOW(step1[18] + step1[19]);
+ step2[20] = WRAPLOW(step1[20] + step1[21]);
+ step2[21] = WRAPLOW(step1[20] - step1[21]);
+ step2[22] = WRAPLOW(-step1[22] + step1[23]);
+ step2[23] = WRAPLOW(step1[22] + step1[23]);
+ step2[24] = WRAPLOW(step1[24] + step1[25]);
+ step2[25] = WRAPLOW(step1[24] - step1[25]);
+ step2[26] = WRAPLOW(-step1[26] + step1[27]);
+ step2[27] = WRAPLOW(step1[26] + step1[27]);
+ step2[28] = WRAPLOW(step1[28] + step1[29]);
+ step2[29] = WRAPLOW(step1[28] - step1[29]);
+ step2[30] = WRAPLOW(-step1[30] + step1[31]);
+ step2[31] = WRAPLOW(step1[30] + step1[31]);
// stage 3
step1[0] = step2[0];
@@ -898,42 +899,42 @@
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
- step1[8] = WRAPLOW(step2[8] + step2[9], 8);
- step1[9] = WRAPLOW(step2[8] - step2[9], 8);
- step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
- step1[11] = WRAPLOW(step2[10] + step2[11], 8);
- step1[12] = WRAPLOW(step2[12] + step2[13], 8);
- step1[13] = WRAPLOW(step2[12] - step2[13], 8);
- step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
- step1[15] = WRAPLOW(step2[14] + step2[15], 8);
+ step1[8] = WRAPLOW(step2[8] + step2[9]);
+ step1[9] = WRAPLOW(step2[8] - step2[9]);
+ step1[10] = WRAPLOW(-step2[10] + step2[11]);
+ step1[11] = WRAPLOW(step2[10] + step2[11]);
+ step1[12] = WRAPLOW(step2[12] + step2[13]);
+ step1[13] = WRAPLOW(step2[12] - step2[13]);
+ step1[14] = WRAPLOW(-step2[14] + step2[15]);
+ step1[15] = WRAPLOW(step2[14] + step2[15]);
step1[16] = step2[16];
step1[31] = step2[31];
temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[30] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2));
step1[19] = step2[19];
step1[20] = step2[20];
temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2));
step1[23] = step2[23];
step1[24] = step2[24];
step1[27] = step2[27];
@@ -942,87 +943,87 @@
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
- step2[4] = WRAPLOW(step1[4] + step1[5], 8);
- step2[5] = WRAPLOW(step1[4] - step1[5], 8);
- step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
- step2[7] = WRAPLOW(step1[6] + step1[7], 8);
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[4] = WRAPLOW(step1[4] + step1[5]);
+ step2[5] = WRAPLOW(step1[4] - step1[5]);
+ step2[6] = WRAPLOW(-step1[6] + step1[7]);
+ step2[7] = WRAPLOW(step1[6] + step1[7]);
step2[8] = step1[8];
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
step2[11] = step1[11];
step2[12] = step1[12];
- step2[16] = WRAPLOW(step1[16] + step1[19], 8);
- step2[17] = WRAPLOW(step1[17] + step1[18], 8);
- step2[18] = WRAPLOW(step1[17] - step1[18], 8);
- step2[19] = WRAPLOW(step1[16] - step1[19], 8);
- step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
- step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
- step2[22] = WRAPLOW(step1[21] + step1[22], 8);
- step2[23] = WRAPLOW(step1[20] + step1[23], 8);
+ step2[16] = WRAPLOW(step1[16] + step1[19]);
+ step2[17] = WRAPLOW(step1[17] + step1[18]);
+ step2[18] = WRAPLOW(step1[17] - step1[18]);
+ step2[19] = WRAPLOW(step1[16] - step1[19]);
+ step2[20] = WRAPLOW(-step1[20] + step1[23]);
+ step2[21] = WRAPLOW(-step1[21] + step1[22]);
+ step2[22] = WRAPLOW(step1[21] + step1[22]);
+ step2[23] = WRAPLOW(step1[20] + step1[23]);
- step2[24] = WRAPLOW(step1[24] + step1[27], 8);
- step2[25] = WRAPLOW(step1[25] + step1[26], 8);
- step2[26] = WRAPLOW(step1[25] - step1[26], 8);
- step2[27] = WRAPLOW(step1[24] - step1[27], 8);
- step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
- step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
- step2[30] = WRAPLOW(step1[29] + step1[30], 8);
- step2[31] = WRAPLOW(step1[28] + step1[31], 8);
+ step2[24] = WRAPLOW(step1[24] + step1[27]);
+ step2[25] = WRAPLOW(step1[25] + step1[26]);
+ step2[26] = WRAPLOW(step1[25] - step1[26]);
+ step2[27] = WRAPLOW(step1[24] - step1[27]);
+ step2[28] = WRAPLOW(-step1[28] + step1[31]);
+ step2[29] = WRAPLOW(-step1[29] + step1[30]);
+ step2[30] = WRAPLOW(step1[29] + step1[30]);
+ step2[31] = WRAPLOW(step1[28] + step1[31]);
// stage 5
- step1[0] = WRAPLOW(step2[0] + step2[3], 8);
- step1[1] = WRAPLOW(step2[1] + step2[2], 8);
- step1[2] = WRAPLOW(step2[1] - step2[2], 8);
- step1[3] = WRAPLOW(step2[0] - step2[3], 8);
+ step1[0] = WRAPLOW(step2[0] + step2[3]);
+ step1[1] = WRAPLOW(step2[1] + step2[2]);
+ step1[2] = WRAPLOW(step2[1] - step2[2]);
+ step1[3] = WRAPLOW(step2[0] - step2[3]);
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
step1[7] = step2[7];
- step1[8] = WRAPLOW(step2[8] + step2[11], 8);
- step1[9] = WRAPLOW(step2[9] + step2[10], 8);
- step1[10] = WRAPLOW(step2[9] - step2[10], 8);
- step1[11] = WRAPLOW(step2[8] - step2[11], 8);
- step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
- step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
- step1[14] = WRAPLOW(step2[13] + step2[14], 8);
- step1[15] = WRAPLOW(step2[12] + step2[15], 8);
+ step1[8] = WRAPLOW(step2[8] + step2[11]);
+ step1[9] = WRAPLOW(step2[9] + step2[10]);
+ step1[10] = WRAPLOW(step2[9] - step2[10]);
+ step1[11] = WRAPLOW(step2[8] - step2[11]);
+ step1[12] = WRAPLOW(-step2[12] + step2[15]);
+ step1[13] = WRAPLOW(-step2[13] + step2[14]);
+ step1[14] = WRAPLOW(step2[13] + step2[14]);
+ step1[15] = WRAPLOW(step2[12] + step2[15]);
step1[16] = step2[16];
step1[17] = step2[17];
temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[28] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
step1[22] = step2[22];
step1[23] = step2[23];
step1[24] = step2[24];
@@ -1031,62 +1032,62 @@
step1[31] = step2[31];
// stage 6
- step2[0] = WRAPLOW(step1[0] + step1[7], 8);
- step2[1] = WRAPLOW(step1[1] + step1[6], 8);
- step2[2] = WRAPLOW(step1[2] + step1[5], 8);
- step2[3] = WRAPLOW(step1[3] + step1[4], 8);
- step2[4] = WRAPLOW(step1[3] - step1[4], 8);
- step2[5] = WRAPLOW(step1[2] - step1[5], 8);
- step2[6] = WRAPLOW(step1[1] - step1[6], 8);
- step2[7] = WRAPLOW(step1[0] - step1[7], 8);
+ step2[0] = WRAPLOW(step1[0] + step1[7]);
+ step2[1] = WRAPLOW(step1[1] + step1[6]);
+ step2[2] = WRAPLOW(step1[2] + step1[5]);
+ step2[3] = WRAPLOW(step1[3] + step1[4]);
+ step2[4] = WRAPLOW(step1[3] - step1[4]);
+ step2[5] = WRAPLOW(step1[2] - step1[5]);
+ step2[6] = WRAPLOW(step1[1] - step1[6]);
+ step2[7] = WRAPLOW(step1[0] - step1[7]);
step2[8] = step1[8];
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
step2[14] = step1[14];
step2[15] = step1[15];
- step2[16] = WRAPLOW(step1[16] + step1[23], 8);
- step2[17] = WRAPLOW(step1[17] + step1[22], 8);
- step2[18] = WRAPLOW(step1[18] + step1[21], 8);
- step2[19] = WRAPLOW(step1[19] + step1[20], 8);
- step2[20] = WRAPLOW(step1[19] - step1[20], 8);
- step2[21] = WRAPLOW(step1[18] - step1[21], 8);
- step2[22] = WRAPLOW(step1[17] - step1[22], 8);
- step2[23] = WRAPLOW(step1[16] - step1[23], 8);
+ step2[16] = WRAPLOW(step1[16] + step1[23]);
+ step2[17] = WRAPLOW(step1[17] + step1[22]);
+ step2[18] = WRAPLOW(step1[18] + step1[21]);
+ step2[19] = WRAPLOW(step1[19] + step1[20]);
+ step2[20] = WRAPLOW(step1[19] - step1[20]);
+ step2[21] = WRAPLOW(step1[18] - step1[21]);
+ step2[22] = WRAPLOW(step1[17] - step1[22]);
+ step2[23] = WRAPLOW(step1[16] - step1[23]);
- step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
- step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
- step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
- step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
- step2[28] = WRAPLOW(step1[27] + step1[28], 8);
- step2[29] = WRAPLOW(step1[26] + step1[29], 8);
- step2[30] = WRAPLOW(step1[25] + step1[30], 8);
- step2[31] = WRAPLOW(step1[24] + step1[31], 8);
+ step2[24] = WRAPLOW(-step1[24] + step1[31]);
+ step2[25] = WRAPLOW(-step1[25] + step1[30]);
+ step2[26] = WRAPLOW(-step1[26] + step1[29]);
+ step2[27] = WRAPLOW(-step1[27] + step1[28]);
+ step2[28] = WRAPLOW(step1[27] + step1[28]);
+ step2[29] = WRAPLOW(step1[26] + step1[29]);
+ step2[30] = WRAPLOW(step1[25] + step1[30]);
+ step2[31] = WRAPLOW(step1[24] + step1[31]);
// stage 7
- step1[0] = WRAPLOW(step2[0] + step2[15], 8);
- step1[1] = WRAPLOW(step2[1] + step2[14], 8);
- step1[2] = WRAPLOW(step2[2] + step2[13], 8);
- step1[3] = WRAPLOW(step2[3] + step2[12], 8);
- step1[4] = WRAPLOW(step2[4] + step2[11], 8);
- step1[5] = WRAPLOW(step2[5] + step2[10], 8);
- step1[6] = WRAPLOW(step2[6] + step2[9], 8);
- step1[7] = WRAPLOW(step2[7] + step2[8], 8);
- step1[8] = WRAPLOW(step2[7] - step2[8], 8);
- step1[9] = WRAPLOW(step2[6] - step2[9], 8);
- step1[10] = WRAPLOW(step2[5] - step2[10], 8);
- step1[11] = WRAPLOW(step2[4] - step2[11], 8);
- step1[12] = WRAPLOW(step2[3] - step2[12], 8);
- step1[13] = WRAPLOW(step2[2] - step2[13], 8);
- step1[14] = WRAPLOW(step2[1] - step2[14], 8);
- step1[15] = WRAPLOW(step2[0] - step2[15], 8);
+ step1[0] = WRAPLOW(step2[0] + step2[15]);
+ step1[1] = WRAPLOW(step2[1] + step2[14]);
+ step1[2] = WRAPLOW(step2[2] + step2[13]);
+ step1[3] = WRAPLOW(step2[3] + step2[12]);
+ step1[4] = WRAPLOW(step2[4] + step2[11]);
+ step1[5] = WRAPLOW(step2[5] + step2[10]);
+ step1[6] = WRAPLOW(step2[6] + step2[9]);
+ step1[7] = WRAPLOW(step2[7] + step2[8]);
+ step1[8] = WRAPLOW(step2[7] - step2[8]);
+ step1[9] = WRAPLOW(step2[6] - step2[9]);
+ step1[10] = WRAPLOW(step2[5] - step2[10]);
+ step1[11] = WRAPLOW(step2[4] - step2[11]);
+ step1[12] = WRAPLOW(step2[3] - step2[12]);
+ step1[13] = WRAPLOW(step2[2] - step2[13]);
+ step1[14] = WRAPLOW(step2[1] - step2[14]);
+ step1[15] = WRAPLOW(step2[0] - step2[15]);
step1[16] = step2[16];
step1[17] = step2[17];
@@ -1094,58 +1095,58 @@
step1[19] = step2[19];
temp1 = (-step2[20] + step2[27]) * cospi_16_64;
temp2 = (step2[20] + step2[27]) * cospi_16_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = (-step2[21] + step2[26]) * cospi_16_64;
temp2 = (step2[21] + step2[26]) * cospi_16_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = (-step2[22] + step2[25]) * cospi_16_64;
temp2 = (step2[22] + step2[25]) * cospi_16_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2));
temp1 = (-step2[23] + step2[24]) * cospi_16_64;
temp2 = (step2[23] + step2[24]) * cospi_16_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
+ step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[24] = WRAPLOW(dct_const_round_shift(temp2));
step1[28] = step2[28];
step1[29] = step2[29];
step1[30] = step2[30];
step1[31] = step2[31];
// final stage
- output[0] = WRAPLOW(step1[0] + step1[31], 8);
- output[1] = WRAPLOW(step1[1] + step1[30], 8);
- output[2] = WRAPLOW(step1[2] + step1[29], 8);
- output[3] = WRAPLOW(step1[3] + step1[28], 8);
- output[4] = WRAPLOW(step1[4] + step1[27], 8);
- output[5] = WRAPLOW(step1[5] + step1[26], 8);
- output[6] = WRAPLOW(step1[6] + step1[25], 8);
- output[7] = WRAPLOW(step1[7] + step1[24], 8);
- output[8] = WRAPLOW(step1[8] + step1[23], 8);
- output[9] = WRAPLOW(step1[9] + step1[22], 8);
- output[10] = WRAPLOW(step1[10] + step1[21], 8);
- output[11] = WRAPLOW(step1[11] + step1[20], 8);
- output[12] = WRAPLOW(step1[12] + step1[19], 8);
- output[13] = WRAPLOW(step1[13] + step1[18], 8);
- output[14] = WRAPLOW(step1[14] + step1[17], 8);
- output[15] = WRAPLOW(step1[15] + step1[16], 8);
- output[16] = WRAPLOW(step1[15] - step1[16], 8);
- output[17] = WRAPLOW(step1[14] - step1[17], 8);
- output[18] = WRAPLOW(step1[13] - step1[18], 8);
- output[19] = WRAPLOW(step1[12] - step1[19], 8);
- output[20] = WRAPLOW(step1[11] - step1[20], 8);
- output[21] = WRAPLOW(step1[10] - step1[21], 8);
- output[22] = WRAPLOW(step1[9] - step1[22], 8);
- output[23] = WRAPLOW(step1[8] - step1[23], 8);
- output[24] = WRAPLOW(step1[7] - step1[24], 8);
- output[25] = WRAPLOW(step1[6] - step1[25], 8);
- output[26] = WRAPLOW(step1[5] - step1[26], 8);
- output[27] = WRAPLOW(step1[4] - step1[27], 8);
- output[28] = WRAPLOW(step1[3] - step1[28], 8);
- output[29] = WRAPLOW(step1[2] - step1[29], 8);
- output[30] = WRAPLOW(step1[1] - step1[30], 8);
- output[31] = WRAPLOW(step1[0] - step1[31], 8);
+ output[0] = WRAPLOW(step1[0] + step1[31]);
+ output[1] = WRAPLOW(step1[1] + step1[30]);
+ output[2] = WRAPLOW(step1[2] + step1[29]);
+ output[3] = WRAPLOW(step1[3] + step1[28]);
+ output[4] = WRAPLOW(step1[4] + step1[27]);
+ output[5] = WRAPLOW(step1[5] + step1[26]);
+ output[6] = WRAPLOW(step1[6] + step1[25]);
+ output[7] = WRAPLOW(step1[7] + step1[24]);
+ output[8] = WRAPLOW(step1[8] + step1[23]);
+ output[9] = WRAPLOW(step1[9] + step1[22]);
+ output[10] = WRAPLOW(step1[10] + step1[21]);
+ output[11] = WRAPLOW(step1[11] + step1[20]);
+ output[12] = WRAPLOW(step1[12] + step1[19]);
+ output[13] = WRAPLOW(step1[13] + step1[18]);
+ output[14] = WRAPLOW(step1[14] + step1[17]);
+ output[15] = WRAPLOW(step1[15] + step1[16]);
+ output[16] = WRAPLOW(step1[15] - step1[16]);
+ output[17] = WRAPLOW(step1[14] - step1[17]);
+ output[18] = WRAPLOW(step1[13] - step1[18]);
+ output[19] = WRAPLOW(step1[12] - step1[19]);
+ output[20] = WRAPLOW(step1[11] - step1[20]);
+ output[21] = WRAPLOW(step1[10] - step1[21]);
+ output[22] = WRAPLOW(step1[9] - step1[22]);
+ output[23] = WRAPLOW(step1[8] - step1[23]);
+ output[24] = WRAPLOW(step1[7] - step1[24]);
+ output[25] = WRAPLOW(step1[6] - step1[25]);
+ output[26] = WRAPLOW(step1[5] - step1[26]);
+ output[27] = WRAPLOW(step1[4] - step1[27]);
+ output[28] = WRAPLOW(step1[3] - step1[28]);
+ output[29] = WRAPLOW(step1[2] - step1[29]);
+ output[30] = WRAPLOW(step1[1] - step1[30]);
+ output[31] = WRAPLOW(step1[0] - step1[31]);
}
void aom_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
@@ -1241,8 +1242,8 @@
int i, j;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 32; ++j) {
@@ -1275,10 +1276,10 @@
c1 = e1 - c1;
a1 -= b1;
d1 += c1;
- op[0] = WRAPLOW(a1, bd);
- op[1] = WRAPLOW(b1, bd);
- op[2] = WRAPLOW(c1, bd);
- op[3] = WRAPLOW(d1, bd);
+ op[0] = HIGHBD_WRAPLOW(a1, bd);
+ op[1] = HIGHBD_WRAPLOW(b1, bd);
+ op[2] = HIGHBD_WRAPLOW(c1, bd);
+ op[3] = HIGHBD_WRAPLOW(d1, bd);
ip += 4;
op += 4;
}
@@ -1296,10 +1297,14 @@
c1 = e1 - c1;
a1 -= b1;
d1 += c1;
- dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
- dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
- dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
- dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
+ dest[stride * 0] =
+ highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
+ dest[stride * 1] =
+ highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
+ dest[stride * 2] =
+ highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
+ dest[stride * 3] =
+ highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
ip++;
dest++;
@@ -1319,8 +1324,8 @@
a1 = ip[0] >> UNIT_QUANT_SHIFT;
e1 = a1 >> 1;
a1 -= e1;
- op[0] = WRAPLOW(a1, bd);
- op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
+ op[0] = HIGHBD_WRAPLOW(a1, bd);
+ op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
ip = tmp;
for (i = 0; i < 4; i++) {
@@ -1346,18 +1351,18 @@
// stage 1
temp1 = (input[0] + input[2]) * cospi_16_64;
temp2 = (input[0] - input[2]) * cospi_16_64;
- step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
// stage 2
- output[0] = WRAPLOW(step[0] + step[3], bd);
- output[1] = WRAPLOW(step[1] + step[2], bd);
- output[2] = WRAPLOW(step[1] - step[2], bd);
- output[3] = WRAPLOW(step[0] - step[3], bd);
+ output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
+ output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
+ output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
+ output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
}
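As a quick sanity check of the 4-point path above, a DC-only input row produces a flat output row, which is what the idct4x4_1_add shortcut below relies on. A hedged worked example (it assumes cospi_16_64 == 11585 and DCT_CONST_BITS == 14, the values used elsewhere in aom_dsp):

#include <assert.h>
#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

int main(void) {
  const int64_t cospi_16_64 = 11585; /* round(cos(pi/4) * 2^14) */
  int64_t input0 = 64; /* DC-only row: input[1] == input[2] == input[3] == 0 */
  /* stage 1: step[0] == step[1] == round_shift(64 * 11585) == 45,
   * step[2] == step[3] == 0 */
  int64_t step0 = ROUND_POWER_OF_TWO(input0 * cospi_16_64, DCT_CONST_BITS);
  assert(step0 == 45);
  /* stage 2: every output is step0 +/- 0, so the row comes out flat. */
  return 0;
}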
void aom_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1391,10 +1396,10 @@
int i;
tran_high_t a1;
tran_low_t out =
- WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+ out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 4);
for (i = 0; i < 4; i++) {
@@ -1416,39 +1421,39 @@
step1[3] = input[6];
temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
// stage 2 & stage 3 - even half
aom_highbd_idct4_c(step1, step1, bd);
// stage 2 - odd half
- step2[4] = WRAPLOW(step1[4] + step1[5], bd);
- step2[5] = WRAPLOW(step1[4] - step1[5], bd);
- step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
- step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+ step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+ step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+ step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+ step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
// stage 3 - odd half
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
step1[7] = step2[7];
// stage 4
- output[0] = WRAPLOW(step1[0] + step1[7], bd);
- output[1] = WRAPLOW(step1[1] + step1[6], bd);
- output[2] = WRAPLOW(step1[2] + step1[5], bd);
- output[3] = WRAPLOW(step1[3] + step1[4], bd);
- output[4] = WRAPLOW(step1[3] - step1[4], bd);
- output[5] = WRAPLOW(step1[2] - step1[5], bd);
- output[6] = WRAPLOW(step1[1] - step1[6], bd);
- output[7] = WRAPLOW(step1[0] - step1[7], bd);
+ output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+ output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+ output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+ output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+ output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+ output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+ output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+ output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
}
void aom_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1482,9 +1487,9 @@
int i, j;
tran_high_t a1;
tran_low_t out =
- WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+ out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
@@ -1513,7 +1518,7 @@
s4 = sinpi_1_9 * x2;
s5 = sinpi_2_9 * x3;
s6 = sinpi_4_9 * x3;
- s7 = (tran_high_t)(x0 - x2 + x3);
+ s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
s0 = s0 + s3 + s5;
s1 = s1 - s4 - s6;
@@ -1524,10 +1529,10 @@
// The overall dynamic range is 14b (input) + 14b (multiplication scaling)
// + 1b (addition) = 29b.
// Hence the output bit depth is 15b.
- output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
- output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
- output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
- output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
+ output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd);
+ output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd);
+ output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+ output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd);
}
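A minimal standalone check of the dynamic-range comment above (a hedged sketch, not part of the change; it assumes DCT_CONST_BITS == 14 and the usual ROUND_POWER_OF_TWO rounding macro used by the library):

#include <assert.h>
#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

int main(void) {
  int64_t s = 1LL << 28; /* the 29-bit signed bound described above */
  /* Removing DCT_CONST_BITS (14) bits leaves 29 - 14 = 15 bits of output. */
  int64_t out = ROUND_POWER_OF_TWO(s, DCT_CONST_BITS);
  assert(out == (1LL << 14));
  return 0;
}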
void aom_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
@@ -1558,14 +1563,14 @@
s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
- x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
- x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
- x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
- x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
- x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
- x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
- x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
- x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
+ x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd);
+ x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd);
+ x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd);
+ x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd);
+ x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd);
+ x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd);
+ x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd);
+ x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd);
// stage 2
s0 = x0;
@@ -1577,14 +1582,14 @@
s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
- x0 = WRAPLOW(s0 + s2, bd);
- x1 = WRAPLOW(s1 + s3, bd);
- x2 = WRAPLOW(s0 - s2, bd);
- x3 = WRAPLOW(s1 - s3, bd);
- x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
- x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
- x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
- x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
+ x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
+ x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
+ x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
+ x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
+ x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
+ x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
+ x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
+ x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
// stage 3
s2 = cospi_16_64 * (x2 + x3);
@@ -1592,19 +1597,19 @@
s6 = cospi_16_64 * (x6 + x7);
s7 = cospi_16_64 * (x6 - x7);
- x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
- x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
- x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
- x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+ x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+ x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
+ x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
+ x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
- output[0] = WRAPLOW(x0, bd);
- output[1] = WRAPLOW(-x4, bd);
- output[2] = WRAPLOW(x6, bd);
- output[3] = WRAPLOW(-x2, bd);
- output[4] = WRAPLOW(x3, bd);
- output[5] = WRAPLOW(-x7, bd);
- output[6] = WRAPLOW(x5, bd);
- output[7] = WRAPLOW(-x1, bd);
+ output[0] = HIGHBD_WRAPLOW(x0, bd);
+ output[1] = HIGHBD_WRAPLOW(-x4, bd);
+ output[2] = HIGHBD_WRAPLOW(x6, bd);
+ output[3] = HIGHBD_WRAPLOW(-x2, bd);
+ output[4] = HIGHBD_WRAPLOW(x3, bd);
+ output[5] = HIGHBD_WRAPLOW(-x7, bd);
+ output[6] = HIGHBD_WRAPLOW(x5, bd);
+ output[7] = HIGHBD_WRAPLOW(-x1, bd);
}
void aom_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1668,23 +1673,23 @@
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
// stage 3
step1[0] = step2[0];
@@ -1694,109 +1699,109 @@
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[8] = WRAPLOW(step2[8] + step2[9], bd);
- step1[9] = WRAPLOW(step2[8] - step2[9], bd);
- step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
- step1[11] = WRAPLOW(step2[10] + step2[11], bd);
- step1[12] = WRAPLOW(step2[12] + step2[13], bd);
- step1[13] = WRAPLOW(step2[12] - step2[13], bd);
- step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
- step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+ step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
+ step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
+ step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
+ step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
+ step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
+ step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
+ step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
+ step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- step2[4] = WRAPLOW(step1[4] + step1[5], bd);
- step2[5] = WRAPLOW(step1[4] - step1[5], bd);
- step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
- step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+ step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+ step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+ step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+ step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
step2[8] = step1[8];
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
step2[11] = step1[11];
step2[12] = step1[12];
// stage 5
- step1[0] = WRAPLOW(step2[0] + step2[3], bd);
- step1[1] = WRAPLOW(step2[1] + step2[2], bd);
- step1[2] = WRAPLOW(step2[1] - step2[2], bd);
- step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+ step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
+ step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
+ step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
+ step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
step1[7] = step2[7];
- step1[8] = WRAPLOW(step2[8] + step2[11], bd);
- step1[9] = WRAPLOW(step2[9] + step2[10], bd);
- step1[10] = WRAPLOW(step2[9] - step2[10], bd);
- step1[11] = WRAPLOW(step2[8] - step2[11], bd);
- step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
- step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
- step1[14] = WRAPLOW(step2[13] + step2[14], bd);
- step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+ step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
+ step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
+ step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
+ step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
+ step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
+ step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
+ step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
+ step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
// stage 6
- step2[0] = WRAPLOW(step1[0] + step1[7], bd);
- step2[1] = WRAPLOW(step1[1] + step1[6], bd);
- step2[2] = WRAPLOW(step1[2] + step1[5], bd);
- step2[3] = WRAPLOW(step1[3] + step1[4], bd);
- step2[4] = WRAPLOW(step1[3] - step1[4], bd);
- step2[5] = WRAPLOW(step1[2] - step1[5], bd);
- step2[6] = WRAPLOW(step1[1] - step1[6], bd);
- step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+ step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+ step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+ step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+ step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+ step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+ step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+ step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+ step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
step2[8] = step1[8];
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
step2[14] = step1[14];
step2[15] = step1[15];
// stage 7
- output[0] = WRAPLOW(step2[0] + step2[15], bd);
- output[1] = WRAPLOW(step2[1] + step2[14], bd);
- output[2] = WRAPLOW(step2[2] + step2[13], bd);
- output[3] = WRAPLOW(step2[3] + step2[12], bd);
- output[4] = WRAPLOW(step2[4] + step2[11], bd);
- output[5] = WRAPLOW(step2[5] + step2[10], bd);
- output[6] = WRAPLOW(step2[6] + step2[9], bd);
- output[7] = WRAPLOW(step2[7] + step2[8], bd);
- output[8] = WRAPLOW(step2[7] - step2[8], bd);
- output[9] = WRAPLOW(step2[6] - step2[9], bd);
- output[10] = WRAPLOW(step2[5] - step2[10], bd);
- output[11] = WRAPLOW(step2[4] - step2[11], bd);
- output[12] = WRAPLOW(step2[3] - step2[12], bd);
- output[13] = WRAPLOW(step2[2] - step2[13], bd);
- output[14] = WRAPLOW(step2[1] - step2[14], bd);
- output[15] = WRAPLOW(step2[0] - step2[15], bd);
+ output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
+ output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
+ output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
+ output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
+ output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
+ output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
+ output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
+ output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
+ output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
+ output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
+ output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
+ output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
+ output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
+ output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
+ output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
+ output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
}
void aom_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -1871,22 +1876,22 @@
s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
- x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
- x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
- x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
- x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
- x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
- x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
- x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
- x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
- x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
- x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
- x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
- x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
- x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
- x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
- x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
- x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
+ x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd);
+ x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd);
+ x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd);
+ x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd);
+ x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd);
+ x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd);
+ x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd);
+ x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd);
+ x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd);
+ x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd);
+ x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd);
+ x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd);
+ x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd);
+ x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd);
+ x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd);
+ x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd);
// stage 2
s0 = x0;
@@ -1906,22 +1911,22 @@
s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
- x0 = WRAPLOW(s0 + s4, bd);
- x1 = WRAPLOW(s1 + s5, bd);
- x2 = WRAPLOW(s2 + s6, bd);
- x3 = WRAPLOW(s3 + s7, bd);
- x4 = WRAPLOW(s0 - s4, bd);
- x5 = WRAPLOW(s1 - s5, bd);
- x6 = WRAPLOW(s2 - s6, bd);
- x7 = WRAPLOW(s3 - s7, bd);
- x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
- x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
- x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
- x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
- x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
- x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
- x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
- x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
+ x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
+ x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
+ x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
+ x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
+ x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
+ x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
+ x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
+ x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
+ x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd);
+ x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd);
+ x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd);
+ x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd);
+ x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd);
+ x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd);
+ x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd);
+ x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd);
// stage 3
s0 = x0;
@@ -1941,22 +1946,22 @@
s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
- x0 = WRAPLOW(s0 + s2, bd);
- x1 = WRAPLOW(s1 + s3, bd);
- x2 = WRAPLOW(s0 - s2, bd);
- x3 = WRAPLOW(s1 - s3, bd);
- x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
- x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
- x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
- x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
- x8 = WRAPLOW(s8 + s10, bd);
- x9 = WRAPLOW(s9 + s11, bd);
- x10 = WRAPLOW(s8 - s10, bd);
- x11 = WRAPLOW(s9 - s11, bd);
- x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
- x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
- x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
- x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
+ x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
+ x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
+ x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
+ x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
+ x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd);
+ x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd);
+ x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd);
+ x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd);
+ x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
+ x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
+ x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
+ x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
+ x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd);
+ x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd);
+ x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd);
+ x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd);
// stage 4
s2 = (-cospi_16_64) * (x2 + x3);
@@ -1968,31 +1973,31 @@
s14 = (-cospi_16_64) * (x14 + x15);
s15 = cospi_16_64 * (x14 - x15);
- x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
- x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
- x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
- x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
- x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
- x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
- x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
- x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
+ x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd);
+ x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd);
+ x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd);
+ x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd);
+ x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd);
+ x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd);
+ x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd);
+ x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd);
- output[0] = WRAPLOW(x0, bd);
- output[1] = WRAPLOW(-x8, bd);
- output[2] = WRAPLOW(x12, bd);
- output[3] = WRAPLOW(-x4, bd);
- output[4] = WRAPLOW(x6, bd);
- output[5] = WRAPLOW(x14, bd);
- output[6] = WRAPLOW(x10, bd);
- output[7] = WRAPLOW(x2, bd);
- output[8] = WRAPLOW(x3, bd);
- output[9] = WRAPLOW(x11, bd);
- output[10] = WRAPLOW(x15, bd);
- output[11] = WRAPLOW(x7, bd);
- output[12] = WRAPLOW(x5, bd);
- output[13] = WRAPLOW(-x13, bd);
- output[14] = WRAPLOW(x9, bd);
- output[15] = WRAPLOW(-x1, bd);
+ output[0] = HIGHBD_WRAPLOW(x0, bd);
+ output[1] = HIGHBD_WRAPLOW(-x8, bd);
+ output[2] = HIGHBD_WRAPLOW(x12, bd);
+ output[3] = HIGHBD_WRAPLOW(-x4, bd);
+ output[4] = HIGHBD_WRAPLOW(x6, bd);
+ output[5] = HIGHBD_WRAPLOW(x14, bd);
+ output[6] = HIGHBD_WRAPLOW(x10, bd);
+ output[7] = HIGHBD_WRAPLOW(x2, bd);
+ output[8] = HIGHBD_WRAPLOW(x3, bd);
+ output[9] = HIGHBD_WRAPLOW(x11, bd);
+ output[10] = HIGHBD_WRAPLOW(x15, bd);
+ output[11] = HIGHBD_WRAPLOW(x7, bd);
+ output[12] = HIGHBD_WRAPLOW(x5, bd);
+ output[13] = HIGHBD_WRAPLOW(-x13, bd);
+ output[14] = HIGHBD_WRAPLOW(x9, bd);
+ output[15] = HIGHBD_WRAPLOW(-x1, bd);
}
void aom_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -2027,10 +2032,10 @@
int i, j;
tran_high_t a1;
tran_low_t out =
- WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+ out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) {
for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
@@ -2038,8 +2043,7 @@
}
}
-static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
- int bd) {
+void aom_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step1[32], step2[32];
tran_high_t temp1, temp2;
(void)bd;
@@ -2064,43 +2068,43 @@
temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
- step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
- step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
- step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
- step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
- step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
- step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
- step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
- step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
// stage 2
step2[0] = step1[0];
@@ -2114,40 +2118,40 @@
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step2[16] = WRAPLOW(step1[16] + step1[17], bd);
- step2[17] = WRAPLOW(step1[16] - step1[17], bd);
- step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
- step2[19] = WRAPLOW(step1[18] + step1[19], bd);
- step2[20] = WRAPLOW(step1[20] + step1[21], bd);
- step2[21] = WRAPLOW(step1[20] - step1[21], bd);
- step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
- step2[23] = WRAPLOW(step1[22] + step1[23], bd);
- step2[24] = WRAPLOW(step1[24] + step1[25], bd);
- step2[25] = WRAPLOW(step1[24] - step1[25], bd);
- step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
- step2[27] = WRAPLOW(step1[26] + step1[27], bd);
- step2[28] = WRAPLOW(step1[28] + step1[29], bd);
- step2[29] = WRAPLOW(step1[28] - step1[29], bd);
- step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
- step2[31] = WRAPLOW(step1[30] + step1[31], bd);
+ step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
+ step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
+ step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
+ step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
+ step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
+ step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
+ step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
+ step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
+ step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
+ step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
+ step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
+ step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
+ step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
+ step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
+ step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
+ step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
// stage 3
step1[0] = step2[0];
@@ -2157,42 +2161,42 @@
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
- step1[8] = WRAPLOW(step2[8] + step2[9], bd);
- step1[9] = WRAPLOW(step2[8] - step2[9], bd);
- step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
- step1[11] = WRAPLOW(step2[10] + step2[11], bd);
- step1[12] = WRAPLOW(step2[12] + step2[13], bd);
- step1[13] = WRAPLOW(step2[12] - step2[13], bd);
- step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
- step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+ step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
+ step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
+ step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
+ step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
+ step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
+ step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
+ step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
+ step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
step1[16] = step2[16];
step1[31] = step2[31];
temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
- step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
- step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
step1[19] = step2[19];
step1[20] = step2[20];
temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
- step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
- step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
step1[23] = step2[23];
step1[24] = step2[24];
step1[27] = step2[27];
@@ -2201,87 +2205,87 @@
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- step2[4] = WRAPLOW(step1[4] + step1[5], bd);
- step2[5] = WRAPLOW(step1[4] - step1[5], bd);
- step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
- step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+ step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
+ step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
+ step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
+ step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
+ step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
step2[8] = step1[8];
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
step2[11] = step1[11];
step2[12] = step1[12];
- step2[16] = WRAPLOW(step1[16] + step1[19], bd);
- step2[17] = WRAPLOW(step1[17] + step1[18], bd);
- step2[18] = WRAPLOW(step1[17] - step1[18], bd);
- step2[19] = WRAPLOW(step1[16] - step1[19], bd);
- step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
- step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
- step2[22] = WRAPLOW(step1[21] + step1[22], bd);
- step2[23] = WRAPLOW(step1[20] + step1[23], bd);
+ step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
+ step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
+ step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
+ step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
+ step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
+ step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
+ step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
+ step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
- step2[24] = WRAPLOW(step1[24] + step1[27], bd);
- step2[25] = WRAPLOW(step1[25] + step1[26], bd);
- step2[26] = WRAPLOW(step1[25] - step1[26], bd);
- step2[27] = WRAPLOW(step1[24] - step1[27], bd);
- step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
- step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
- step2[30] = WRAPLOW(step1[29] + step1[30], bd);
- step2[31] = WRAPLOW(step1[28] + step1[31], bd);
+ step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
+ step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
+ step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
+ step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
+ step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
+ step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
+ step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
+ step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
// stage 5
- step1[0] = WRAPLOW(step2[0] + step2[3], bd);
- step1[1] = WRAPLOW(step2[1] + step2[2], bd);
- step1[2] = WRAPLOW(step2[1] - step2[2], bd);
- step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+ step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
+ step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
+ step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
+ step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
step1[7] = step2[7];
- step1[8] = WRAPLOW(step2[8] + step2[11], bd);
- step1[9] = WRAPLOW(step2[9] + step2[10], bd);
- step1[10] = WRAPLOW(step2[9] - step2[10], bd);
- step1[11] = WRAPLOW(step2[8] - step2[11], bd);
- step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
- step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
- step1[14] = WRAPLOW(step2[13] + step2[14], bd);
- step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+ step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
+ step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
+ step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
+ step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
+ step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
+ step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
+ step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
+ step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
step1[16] = step2[16];
step1[17] = step2[17];
temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
- step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
- step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
- step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
- step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
step1[22] = step2[22];
step1[23] = step2[23];
step1[24] = step2[24];
@@ -2290,62 +2294,62 @@
step1[31] = step2[31];
// stage 6
- step2[0] = WRAPLOW(step1[0] + step1[7], bd);
- step2[1] = WRAPLOW(step1[1] + step1[6], bd);
- step2[2] = WRAPLOW(step1[2] + step1[5], bd);
- step2[3] = WRAPLOW(step1[3] + step1[4], bd);
- step2[4] = WRAPLOW(step1[3] - step1[4], bd);
- step2[5] = WRAPLOW(step1[2] - step1[5], bd);
- step2[6] = WRAPLOW(step1[1] - step1[6], bd);
- step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+ step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
+ step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
+ step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
+ step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
+ step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
+ step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
+ step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
+ step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
step2[8] = step1[8];
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
step2[14] = step1[14];
step2[15] = step1[15];
- step2[16] = WRAPLOW(step1[16] + step1[23], bd);
- step2[17] = WRAPLOW(step1[17] + step1[22], bd);
- step2[18] = WRAPLOW(step1[18] + step1[21], bd);
- step2[19] = WRAPLOW(step1[19] + step1[20], bd);
- step2[20] = WRAPLOW(step1[19] - step1[20], bd);
- step2[21] = WRAPLOW(step1[18] - step1[21], bd);
- step2[22] = WRAPLOW(step1[17] - step1[22], bd);
- step2[23] = WRAPLOW(step1[16] - step1[23], bd);
+ step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
+ step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
+ step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
+ step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
+ step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
+ step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
+ step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
+ step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
- step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
- step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
- step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
- step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
- step2[28] = WRAPLOW(step1[27] + step1[28], bd);
- step2[29] = WRAPLOW(step1[26] + step1[29], bd);
- step2[30] = WRAPLOW(step1[25] + step1[30], bd);
- step2[31] = WRAPLOW(step1[24] + step1[31], bd);
+ step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
+ step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
+ step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
+ step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
+ step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
+ step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
+ step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
+ step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
// stage 7
- step1[0] = WRAPLOW(step2[0] + step2[15], bd);
- step1[1] = WRAPLOW(step2[1] + step2[14], bd);
- step1[2] = WRAPLOW(step2[2] + step2[13], bd);
- step1[3] = WRAPLOW(step2[3] + step2[12], bd);
- step1[4] = WRAPLOW(step2[4] + step2[11], bd);
- step1[5] = WRAPLOW(step2[5] + step2[10], bd);
- step1[6] = WRAPLOW(step2[6] + step2[9], bd);
- step1[7] = WRAPLOW(step2[7] + step2[8], bd);
- step1[8] = WRAPLOW(step2[7] - step2[8], bd);
- step1[9] = WRAPLOW(step2[6] - step2[9], bd);
- step1[10] = WRAPLOW(step2[5] - step2[10], bd);
- step1[11] = WRAPLOW(step2[4] - step2[11], bd);
- step1[12] = WRAPLOW(step2[3] - step2[12], bd);
- step1[13] = WRAPLOW(step2[2] - step2[13], bd);
- step1[14] = WRAPLOW(step2[1] - step2[14], bd);
- step1[15] = WRAPLOW(step2[0] - step2[15], bd);
+ step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
+ step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
+ step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
+ step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
+ step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
+ step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
+ step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
+ step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
+ step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
+ step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
+ step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
+ step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
+ step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
+ step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
+ step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
+ step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
step1[16] = step2[16];
step1[17] = step2[17];
@@ -2353,58 +2357,58 @@
step1[19] = step2[19];
temp1 = (-step2[20] + step2[27]) * cospi_16_64;
temp2 = (step2[20] + step2[27]) * cospi_16_64;
- step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = (-step2[21] + step2[26]) * cospi_16_64;
temp2 = (step2[21] + step2[26]) * cospi_16_64;
- step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = (-step2[22] + step2[25]) * cospi_16_64;
temp2 = (step2[22] + step2[25]) * cospi_16_64;
- step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
temp1 = (-step2[23] + step2[24]) * cospi_16_64;
temp2 = (step2[23] + step2[24]) * cospi_16_64;
- step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
+ step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd);
+ step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd);
step1[28] = step2[28];
step1[29] = step2[29];
step1[30] = step2[30];
step1[31] = step2[31];
// final stage
- output[0] = WRAPLOW(step1[0] + step1[31], bd);
- output[1] = WRAPLOW(step1[1] + step1[30], bd);
- output[2] = WRAPLOW(step1[2] + step1[29], bd);
- output[3] = WRAPLOW(step1[3] + step1[28], bd);
- output[4] = WRAPLOW(step1[4] + step1[27], bd);
- output[5] = WRAPLOW(step1[5] + step1[26], bd);
- output[6] = WRAPLOW(step1[6] + step1[25], bd);
- output[7] = WRAPLOW(step1[7] + step1[24], bd);
- output[8] = WRAPLOW(step1[8] + step1[23], bd);
- output[9] = WRAPLOW(step1[9] + step1[22], bd);
- output[10] = WRAPLOW(step1[10] + step1[21], bd);
- output[11] = WRAPLOW(step1[11] + step1[20], bd);
- output[12] = WRAPLOW(step1[12] + step1[19], bd);
- output[13] = WRAPLOW(step1[13] + step1[18], bd);
- output[14] = WRAPLOW(step1[14] + step1[17], bd);
- output[15] = WRAPLOW(step1[15] + step1[16], bd);
- output[16] = WRAPLOW(step1[15] - step1[16], bd);
- output[17] = WRAPLOW(step1[14] - step1[17], bd);
- output[18] = WRAPLOW(step1[13] - step1[18], bd);
- output[19] = WRAPLOW(step1[12] - step1[19], bd);
- output[20] = WRAPLOW(step1[11] - step1[20], bd);
- output[21] = WRAPLOW(step1[10] - step1[21], bd);
- output[22] = WRAPLOW(step1[9] - step1[22], bd);
- output[23] = WRAPLOW(step1[8] - step1[23], bd);
- output[24] = WRAPLOW(step1[7] - step1[24], bd);
- output[25] = WRAPLOW(step1[6] - step1[25], bd);
- output[26] = WRAPLOW(step1[5] - step1[26], bd);
- output[27] = WRAPLOW(step1[4] - step1[27], bd);
- output[28] = WRAPLOW(step1[3] - step1[28], bd);
- output[29] = WRAPLOW(step1[2] - step1[29], bd);
- output[30] = WRAPLOW(step1[1] - step1[30], bd);
- output[31] = WRAPLOW(step1[0] - step1[31], bd);
+ output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
+ output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
+ output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
+ output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
+ output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
+ output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
+ output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
+ output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
+ output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
+ output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
+ output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
+ output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
+ output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
+ output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
+ output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
+ output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
+ output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
+ output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
+ output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
+ output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
+ output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
+ output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
+ output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
+ output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
+ output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
+ output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
+ output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
+ output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
+ output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
+ output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
+ output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
+ output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
}
void aom_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
@@ -2427,7 +2431,7 @@
zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
if (zero_coeff[0] | zero_coeff[1])
- highbd_idct32_c(input, outptr, bd);
+ aom_highbd_idct32_c(input, outptr, bd);
else
memset(outptr, 0, sizeof(tran_low_t) * 32);
input += 32;
@@ -2437,7 +2441,7 @@
// Columns
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- highbd_idct32_c(temp_in, temp_out, bd);
+ aom_highbd_idct32_c(temp_in, temp_out, bd);
for (j = 0; j < 32; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
@@ -2456,14 +2460,14 @@
// Rows
// Only upper-left 8x8 has non-zero coeff.
for (i = 0; i < 8; ++i) {
- highbd_idct32_c(input, outptr, bd);
+ aom_highbd_idct32_c(input, outptr, bd);
input += 32;
outptr += 32;
}
// Columns
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- highbd_idct32_c(temp_in, temp_out, bd);
+ aom_highbd_idct32_c(temp_in, temp_out, bd);
for (j = 0; j < 32; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
@@ -2478,8 +2482,8 @@
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
tran_low_t out =
- WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
- out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd);
+ out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 32; ++j) {
diff --git a/aom_dsp/inv_txfm.h b/aom_dsp/inv_txfm.h
index 129307e..c3d794e 100644
--- a/aom_dsp/inv_txfm.h
+++ b/aom_dsp/inv_txfm.h
@@ -22,29 +22,29 @@
extern "C" {
#endif
-static INLINE tran_low_t check_range(tran_high_t input) {
+static INLINE tran_high_t check_range(tran_high_t input) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
- // For valid input streams, intermediate stage coefficients should always
+ // For valid AV1 input streams, intermediate stage coefficients should always
// stay within the range of a signed 16 bit integer. Coefficients can go out
- // of this range for invalid/corrupt streams. However, strictly checking
+ // of this range for invalid/corrupt AV1 streams. However, strictly checking
// this range for every intermediate coefficient can be burdensome for a decoder,
// therefore the following assertion is only enabled when configured with
// --enable-coefficient-range-checking.
assert(INT16_MIN <= input);
assert(input <= INT16_MAX);
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
- return (tran_low_t)input;
+ return input;
}
-static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- return check_range(rv);
+ return rv;
}
#if CONFIG_AOM_HIGHBITDEPTH
-static INLINE tran_low_t highbd_check_range(tran_high_t input, int bd) {
+static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
- // For valid highbitdepth streams, intermediate stage coefficients will
+ // For valid highbitdepth AV1 streams, intermediate stage coefficients will
// stay within the ranges:
// - 8 bit: signed 16 bit integer
// - 10 bit: signed 18 bit integer
@@ -56,13 +56,12 @@
(void)int_min;
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
(void)bd;
- return (tran_low_t)input;
+ return input;
}
-static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
- int bd) {
+static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- return highbd_check_range(rv, bd);
+ return rv;
}
#endif // CONFIG_AOM_HIGHBITDEPTH
@@ -83,9 +82,19 @@
// bd of 10 uses trans_low with 18bits, need to remove 14bits
// bd of 12 uses trans_low with 20bits, need to remove 12bits
// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
-#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
-#else
-#define WRAPLOW(x, bd) ((int32_t)(x))
+
+#define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16)
+#if CONFIG_AOM_HIGHBITDEPTH
+#define HIGHBD_WRAPLOW(x, bd) \
+ ((((int32_t)highbd_check_range((x), bd)) << (24 - bd)) >> (24 - bd))
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+#else // CONFIG_EMULATE_HARDWARE
+
+#define WRAPLOW(x) ((int32_t)check_range(x))
+#if CONFIG_AOM_HIGHBITDEPTH
+#define HIGHBD_WRAPLOW(x, bd) ((int32_t)highbd_check_range((x), bd))
+#endif // CONFIG_AOM_HIGHBITDEPTH
#endif // CONFIG_EMULATE_HARDWARE
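The shift pair in these macros is a sign-extending truncation: with CONFIG_EMULATE_HARDWARE enabled, the intermediate coefficient is reduced to 16 bits (or 8 + bd bits in the high-bitdepth case) and sign-extended, mimicking the wraparound of a fixed-width hardware datapath; without it, only the optional range check runs. A minimal scalar sketch of the same arithmetic (the helper names are illustrative, not part of this patch):

    #include <stdint.h>

    /* Keep the low 16 bits and sign-extend, as WRAPLOW does when
     * CONFIG_EMULATE_HARDWARE is enabled. */
    static int32_t wrap_low_16(int32_t x) { return (x << 16) >> 16; }

    /* Keep the low (8 + bd) bits for bit depth bd (8, 10 or 12), as
     * HIGHBD_WRAPLOW does; e.g. bd = 10 keeps 18 bits. */
    static int32_t wrap_low_bd(int32_t x, int bd) {
      return (x << (24 - bd)) >> (24 - bd);
    }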
void aom_idct4_c(const tran_low_t *input, tran_low_t *output);
@@ -100,6 +109,7 @@
void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
void aom_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
void aom_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
+void aom_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd);
void aom_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
void aom_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
@@ -107,14 +117,14 @@
static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
int bd) {
- trans = WRAPLOW(trans, bd);
- return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
+ trans = HIGHBD_WRAPLOW(trans, bd);
+ return clip_pixel_highbd(dest + (int)trans, bd);
}
#endif
static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
- trans = WRAPLOW(trans, 8);
- return clip_pixel(WRAPLOW(dest + trans, 8));
+ trans = WRAPLOW(trans);
+ return clip_pixel(dest + (int)trans);
}
#ifdef __cplusplus
} // extern "C"
diff --git a/aom_dsp/mips/add_noise_msa.c b/aom_dsp/mips/add_noise_msa.c
new file mode 100644
index 0000000..fe3510d
--- /dev/null
+++ b/aom_dsp/mips/add_noise_msa.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "./macros_msa.h"
+
+void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
+ char blackclamp[16], char whiteclamp[16],
+ char bothclamp[16], uint32_t width,
+ uint32_t height, int32_t pitch) {
+ uint32_t i, j;
+
+ for (i = 0; i < height / 2; ++i) {
+ uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
+ int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff));
+ uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
+ int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff));
+ for (j = width / 16; j--;) {
+ v16i8 temp00_s, temp01_s;
+ v16u8 temp00, temp01, black_clamp, white_clamp;
+ v16u8 pos0, ref0, pos1, ref1;
+ v16i8 const127 = __msa_ldi_b(127);
+
+ pos0 = LD_UB(pos0_ptr);
+ ref0 = LD_UB(ref0_ptr);
+ pos1 = LD_UB(pos1_ptr);
+ ref1 = LD_UB(ref1_ptr);
+ black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
+ white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
+ temp00 = (pos0 < black_clamp);
+ pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
+ temp01 = (pos1 < black_clamp);
+ pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
+ XORI_B2_128_UB(pos0, pos1);
+ temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
+ temp00 = (v16u8)(temp00_s < pos0);
+ pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
+ temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
+ temp01 = (temp01_s < pos1);
+ pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
+ XORI_B2_128_UB(pos0, pos1);
+ pos0 += ref0;
+ ST_UB(pos0, pos0_ptr);
+ pos1 += ref1;
+ ST_UB(pos1, pos1_ptr);
+ pos0_ptr += 16;
+ pos1_ptr += 16;
+ ref0_ptr += 16;
+ ref1_ptr += 16;
+ }
+ }
+}
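Read per pixel, the vector code above clamps each sample away from the black/white limits and then adds a signed noise byte taken at a random offset into the noise table, processing two rows per iteration. A rough scalar sketch of that behaviour, hedged because the exact clamp arithmetic is expressed through saturating MSA ops (all names below are illustrative):

    #include <stdint.h>
    #include <stdlib.h>

    /* Rough scalar equivalent of one row of aom_plane_add_noise_msa: keep the
     * pixel inside roughly [black, 255 - white], then add a signed noise byte.
     * The row offset into the noise table matches the (rand() & 0xff) above. */
    static void add_noise_row_sketch(uint8_t *pos, const int8_t *noise,
                                     uint8_t black, uint8_t white,
                                     uint32_t width) {
      const int8_t *ref = noise + (rand() & 0xff);
      for (uint32_t j = 0; j < width; ++j) {
        int v = pos[j];
        if (v < black) v = black;
        if (v > 255 - white) v = 255 - white;
        pos[j] = (uint8_t)(v + ref[j]);
      }
    }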
diff --git a/aom_dsp/mips/deblock_msa.c b/aom_dsp/mips/deblock_msa.c
new file mode 100644
index 0000000..37e3e4a
--- /dev/null
+++ b/aom_dsp/mips/deblock_msa.c
@@ -0,0 +1,681 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "./macros_msa.h"
+
+extern const int16_t aom_rv[];
+
+#define AOM_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, out0, \
+ out1, out2, out3, out4, out5, out6, out7, \
+ out8, out9, out10, out11, out12, out13, out14, \
+ out15) \
+ { \
+ v8i16 temp0, temp1, temp2, temp3, temp4; \
+ v8i16 temp5, temp6, temp7, temp8, temp9; \
+ \
+ ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \
+ temp3); \
+ ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
+ ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_SH(temp5, temp4, temp8, temp9); \
+ ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \
+ temp3); \
+ ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_UB(temp5, temp4, out8, out10); \
+ ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \
+ ILVRL_W2_UB(temp5, temp4, out12, out14); \
+ out0 = (v16u8)temp6; \
+ out2 = (v16u8)temp7; \
+ out4 = (v16u8)temp8; \
+ out6 = (v16u8)temp9; \
+ out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \
+ out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \
+ out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \
+ out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \
+ out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
+ out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
+ out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \
+ out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \
+ }
+
+#define AOM_AVER_IF_RETAIN(above2_in, above1_in, src_in, below1_in, below2_in, \
+ ref, out) \
+ { \
+ v16u8 temp0, temp1; \
+ \
+ temp1 = __msa_aver_u_b(above2_in, above1_in); \
+ temp0 = __msa_aver_u_b(below2_in, below1_in); \
+ temp1 = __msa_aver_u_b(temp1, temp0); \
+ out = __msa_aver_u_b(src_in, temp1); \
+ temp0 = __msa_asub_u_b(src_in, above2_in); \
+ temp1 = __msa_asub_u_b(src_in, above1_in); \
+ temp0 = (temp0 < ref); \
+ temp1 = (temp1 < ref); \
+ temp0 = temp0 & temp1; \
+ temp1 = __msa_asub_u_b(src_in, below1_in); \
+ temp1 = (temp1 < ref); \
+ temp0 = temp0 & temp1; \
+ temp1 = __msa_asub_u_b(src_in, below2_in); \
+ temp1 = (temp1 < ref); \
+ temp0 = temp0 & temp1; \
+ out = __msa_bmz_v(out, src_in, temp0); \
+ }
+
+#define TRANSPOSE12x16_B(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
+ in10, in11, in12, in13, in14, in15) \
+ { \
+ v8i16 temp0, temp1, temp2, temp3, temp4; \
+ v8i16 temp5, temp6, temp7, temp8, temp9; \
+ \
+ ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
+ ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
+ ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
+ ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
+ ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \
+ ILVR_B2_SH(in9, in8, in11, in10, temp4, temp5); \
+ ILVRL_H2_SH(temp5, temp4, temp6, temp7); \
+ ILVR_B2_SH(in13, in12, in15, in14, temp4, temp5); \
+ ILVRL_H2_SH(temp5, temp4, temp8, temp9); \
+ ILVRL_W2_SH(temp8, temp6, temp4, temp5); \
+ ILVRL_W2_SH(temp9, temp7, temp6, temp7); \
+ ILVL_B2_SH(in1, in0, in3, in2, temp8, temp9); \
+ ILVR_D2_UB(temp4, temp0, temp5, temp1, in0, in2); \
+ in1 = (v16u8)__msa_ilvl_d((v2i64)temp4, (v2i64)temp0); \
+ in3 = (v16u8)__msa_ilvl_d((v2i64)temp5, (v2i64)temp1); \
+ ILVL_B2_SH(in5, in4, in7, in6, temp0, temp1); \
+ ILVR_D2_UB(temp6, temp2, temp7, temp3, in4, in6); \
+ in5 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp2); \
+ in7 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp3); \
+ ILVL_B4_SH(in9, in8, in11, in10, in13, in12, in15, in14, temp2, temp3, \
+ temp4, temp5); \
+ ILVR_H4_SH(temp9, temp8, temp1, temp0, temp3, temp2, temp5, temp4, temp6, \
+ temp7, temp8, temp9); \
+ ILVR_W2_SH(temp7, temp6, temp9, temp8, temp0, temp1); \
+ in8 = (v16u8)__msa_ilvr_d((v2i64)temp1, (v2i64)temp0); \
+ in9 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp0); \
+ ILVL_W2_SH(temp7, temp6, temp9, temp8, temp2, temp3); \
+ in10 = (v16u8)__msa_ilvr_d((v2i64)temp3, (v2i64)temp2); \
+ in11 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp2); \
+ }
+
+#define AOM_TRANSPOSE12x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, \
+ in9, in10, in11) \
+ { \
+ v8i16 temp0, temp1, temp2, temp3; \
+ v8i16 temp4, temp5, temp6, temp7; \
+ \
+ ILVR_B2_SH(in1, in0, in3, in2, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp2, temp3); \
+ ILVR_B2_SH(in5, in4, in7, in6, temp0, temp1); \
+ ILVRL_H2_SH(temp1, temp0, temp4, temp5); \
+ ILVRL_W2_SH(temp4, temp2, temp0, temp1); \
+ ILVRL_W2_SH(temp5, temp3, temp2, temp3); \
+ ILVL_B2_SH(in1, in0, in3, in2, temp4, temp5); \
+ temp4 = __msa_ilvr_h(temp5, temp4); \
+ ILVL_B2_SH(in5, in4, in7, in6, temp6, temp7); \
+ temp5 = __msa_ilvr_h(temp7, temp6); \
+ ILVRL_W2_SH(temp5, temp4, temp6, temp7); \
+ in0 = (v16u8)temp0; \
+ in2 = (v16u8)temp1; \
+ in4 = (v16u8)temp2; \
+ in6 = (v16u8)temp3; \
+ in8 = (v16u8)temp6; \
+ in10 = (v16u8)temp7; \
+ in1 = (v16u8)__msa_ilvl_d((v2i64)temp0, (v2i64)temp0); \
+ in3 = (v16u8)__msa_ilvl_d((v2i64)temp1, (v2i64)temp1); \
+ in5 = (v16u8)__msa_ilvl_d((v2i64)temp2, (v2i64)temp2); \
+ in7 = (v16u8)__msa_ilvl_d((v2i64)temp3, (v2i64)temp3); \
+ in9 = (v16u8)__msa_ilvl_d((v2i64)temp6, (v2i64)temp6); \
+ in11 = (v16u8)__msa_ilvl_d((v2i64)temp7, (v2i64)temp7); \
+ }
+
+static void postproc_down_across_chroma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
+ int32_t src_stride,
+ int32_t dst_stride, int32_t cols,
+ uint8_t *f) {
+ uint8_t *p_src = src_ptr;
+ uint8_t *p_dst = dst_ptr;
+ uint8_t *f_orig = f;
+ uint8_t *p_dst_st = dst_ptr;
+ uint16_t col;
+ uint64_t out0, out1, out2, out3;
+ v16u8 above2, above1, below2, below1, src, ref, ref_temp;
+ v16u8 inter0, inter1, inter2, inter3, inter4, inter5;
+ v16u8 inter6, inter7, inter8, inter9, inter10, inter11;
+
+ for (col = (cols / 16); col--;) {
+ ref = LD_UB(f);
+ LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+ src = LD_UB(p_src);
+ LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+ AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+ above2 = LD_UB(p_src + 3 * src_stride);
+ AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+ above1 = LD_UB(p_src + 4 * src_stride);
+ AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+ src = LD_UB(p_src + 5 * src_stride);
+ AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+ below1 = LD_UB(p_src + 6 * src_stride);
+ AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+ below2 = LD_UB(p_src + 7 * src_stride);
+ AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+ above2 = LD_UB(p_src + 8 * src_stride);
+ AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+ above1 = LD_UB(p_src + 9 * src_stride);
+ AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+ ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
+ p_dst, dst_stride);
+
+ p_dst += 16;
+ p_src += 16;
+ f += 16;
+ }
+
+ if (0 != (cols / 16)) {
+ ref = LD_UB(f);
+ LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+ src = LD_UB(p_src);
+ LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+ AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+ above2 = LD_UB(p_src + 3 * src_stride);
+ AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+ above1 = LD_UB(p_src + 4 * src_stride);
+ AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+ src = LD_UB(p_src + 5 * src_stride);
+ AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+ below1 = LD_UB(p_src + 6 * src_stride);
+ AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+ below2 = LD_UB(p_src + 7 * src_stride);
+ AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+ above2 = LD_UB(p_src + 8 * src_stride);
+ AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+ above1 = LD_UB(p_src + 9 * src_stride);
+ AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+ out0 = __msa_copy_u_d((v2i64)inter0, 0);
+ out1 = __msa_copy_u_d((v2i64)inter1, 0);
+ out2 = __msa_copy_u_d((v2i64)inter2, 0);
+ out3 = __msa_copy_u_d((v2i64)inter3, 0);
+ SD4(out0, out1, out2, out3, p_dst, dst_stride);
+
+ out0 = __msa_copy_u_d((v2i64)inter4, 0);
+ out1 = __msa_copy_u_d((v2i64)inter5, 0);
+ out2 = __msa_copy_u_d((v2i64)inter6, 0);
+ out3 = __msa_copy_u_d((v2i64)inter7, 0);
+ SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
+ }
+
+ f = f_orig;
+ p_dst = dst_ptr - 2;
+ LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
+ inter6, inter7);
+
+ for (col = 0; col < (cols / 8); ++col) {
+ ref = LD_UB(f);
+ f += 8;
+ AOM_TRANSPOSE12x8_UB_UB(inter0, inter1, inter2, inter3, inter4, inter5,
+ inter6, inter7, inter8, inter9, inter10, inter11);
+ if (0 == col) {
+ above2 = inter2;
+ above1 = inter2;
+ } else {
+ above2 = inter0;
+ above1 = inter1;
+ }
+ src = inter2;
+ below1 = inter3;
+ below2 = inter4;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
+ AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
+ above2 = inter5;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
+ AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
+ above1 = inter6;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
+ AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
+ src = inter7;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
+ AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
+ below1 = inter8;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
+ AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
+ below2 = inter9;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
+ AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
+ if (col == (cols / 8 - 1)) {
+ above2 = inter9;
+ } else {
+ above2 = inter10;
+ }
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
+ AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
+ if (col == (cols / 8 - 1)) {
+ above1 = inter9;
+ } else {
+ above1 = inter11;
+ }
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
+ AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
+ TRANSPOSE8x8_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7, inter8,
+ inter9, inter2, inter3, inter4, inter5, inter6, inter7,
+ inter8, inter9);
+ p_dst += 8;
+ LD_UB2(p_dst, dst_stride, inter0, inter1);
+ ST8x1_UB(inter2, p_dst_st);
+ ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
+ LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
+ ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
+ ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
+ LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
+ ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
+ ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
+ LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
+ ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
+ ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
+ p_dst_st += 8;
+ }
+}
+
+static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
+ int32_t src_stride,
+ int32_t dst_stride, int32_t cols,
+ uint8_t *f) {
+ uint8_t *p_src = src_ptr;
+ uint8_t *p_dst = dst_ptr;
+ uint8_t *p_dst_st = dst_ptr;
+ uint8_t *f_orig = f;
+ uint16_t col;
+ v16u8 above2, above1, below2, below1;
+ v16u8 src, ref, ref_temp;
+ v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
+ v16u8 inter7, inter8, inter9, inter10, inter11;
+ v16u8 inter12, inter13, inter14, inter15;
+
+ for (col = (cols / 16); col--;) {
+ ref = LD_UB(f);
+ LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+ src = LD_UB(p_src);
+ LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+ AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+ above2 = LD_UB(p_src + 3 * src_stride);
+ AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+ above1 = LD_UB(p_src + 4 * src_stride);
+ AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+ src = LD_UB(p_src + 5 * src_stride);
+ AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+ below1 = LD_UB(p_src + 6 * src_stride);
+ AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+ below2 = LD_UB(p_src + 7 * src_stride);
+ AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+ above2 = LD_UB(p_src + 8 * src_stride);
+ AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+ above1 = LD_UB(p_src + 9 * src_stride);
+ AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+ src = LD_UB(p_src + 10 * src_stride);
+ AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
+ below1 = LD_UB(p_src + 11 * src_stride);
+ AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
+ below2 = LD_UB(p_src + 12 * src_stride);
+ AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
+ above2 = LD_UB(p_src + 13 * src_stride);
+ AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
+ above1 = LD_UB(p_src + 14 * src_stride);
+ AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
+ src = LD_UB(p_src + 15 * src_stride);
+ AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
+ below1 = LD_UB(p_src + 16 * src_stride);
+ AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
+ below2 = LD_UB(p_src + 17 * src_stride);
+ AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
+ ST_UB8(inter0, inter1, inter2, inter3, inter4, inter5, inter6, inter7,
+ p_dst, dst_stride);
+ ST_UB8(inter8, inter9, inter10, inter11, inter12, inter13, inter14, inter15,
+ p_dst + 8 * dst_stride, dst_stride);
+ p_src += 16;
+ p_dst += 16;
+ f += 16;
+ }
+
+ f = f_orig;
+ p_dst = dst_ptr - 2;
+ LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
+ inter6, inter7);
+ LD_UB8(p_dst + 8 * dst_stride, dst_stride, inter8, inter9, inter10, inter11,
+ inter12, inter13, inter14, inter15);
+
+ for (col = 0; col < cols / 8; ++col) {
+ ref = LD_UB(f);
+ f += 8;
+ TRANSPOSE12x16_B(inter0, inter1, inter2, inter3, inter4, inter5, inter6,
+ inter7, inter8, inter9, inter10, inter11, inter12, inter13,
+ inter14, inter15);
+ if (0 == col) {
+ above2 = inter2;
+ above1 = inter2;
+ } else {
+ above2 = inter0;
+ above1 = inter1;
+ }
+
+ src = inter2;
+ below1 = inter3;
+ below2 = inter4;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 0);
+ AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter2);
+ above2 = inter5;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 1);
+ AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter3);
+ above1 = inter6;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 2);
+ AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter4);
+ src = inter7;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 3);
+ AOM_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref_temp, inter5);
+ below1 = inter8;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 4);
+ AOM_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref_temp, inter6);
+ below2 = inter9;
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 5);
+ AOM_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref_temp, inter7);
+ if (col == (cols / 8 - 1)) {
+ above2 = inter9;
+ } else {
+ above2 = inter10;
+ }
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 6);
+ AOM_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref_temp, inter8);
+ if (col == (cols / 8 - 1)) {
+ above1 = inter9;
+ } else {
+ above1 = inter11;
+ }
+ ref_temp = (v16u8)__msa_splati_b((v16i8)ref, 7);
+ AOM_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref_temp, inter9);
+ AOM_TRANSPOSE8x16_UB_UB(inter2, inter3, inter4, inter5, inter6, inter7,
+ inter8, inter9, inter2, inter3, inter4, inter5,
+ inter6, inter7, inter8, inter9, inter10, inter11,
+ inter12, inter13, inter14, inter15, above2, above1);
+
+ p_dst += 8;
+ LD_UB2(p_dst, dst_stride, inter0, inter1);
+ ST8x1_UB(inter2, p_dst_st);
+ ST8x1_UB(inter3, (p_dst_st + 1 * dst_stride));
+ LD_UB2(p_dst + 2 * dst_stride, dst_stride, inter2, inter3);
+ ST8x1_UB(inter4, (p_dst_st + 2 * dst_stride));
+ ST8x1_UB(inter5, (p_dst_st + 3 * dst_stride));
+ LD_UB2(p_dst + 4 * dst_stride, dst_stride, inter4, inter5);
+ ST8x1_UB(inter6, (p_dst_st + 4 * dst_stride));
+ ST8x1_UB(inter7, (p_dst_st + 5 * dst_stride));
+ LD_UB2(p_dst + 6 * dst_stride, dst_stride, inter6, inter7);
+ ST8x1_UB(inter8, (p_dst_st + 6 * dst_stride));
+ ST8x1_UB(inter9, (p_dst_st + 7 * dst_stride));
+ LD_UB2(p_dst + 8 * dst_stride, dst_stride, inter8, inter9);
+ ST8x1_UB(inter10, (p_dst_st + 8 * dst_stride));
+ ST8x1_UB(inter11, (p_dst_st + 9 * dst_stride));
+ LD_UB2(p_dst + 10 * dst_stride, dst_stride, inter10, inter11);
+ ST8x1_UB(inter12, (p_dst_st + 10 * dst_stride));
+ ST8x1_UB(inter13, (p_dst_st + 11 * dst_stride));
+ LD_UB2(p_dst + 12 * dst_stride, dst_stride, inter12, inter13);
+ ST8x1_UB(inter14, (p_dst_st + 12 * dst_stride));
+ ST8x1_UB(inter15, (p_dst_st + 13 * dst_stride));
+ LD_UB2(p_dst + 14 * dst_stride, dst_stride, inter14, inter15);
+ ST8x1_UB(above2, (p_dst_st + 14 * dst_stride));
+ ST8x1_UB(above1, (p_dst_st + 15 * dst_stride));
+ p_dst_st += 8;
+ }
+}
+
+void aom_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst,
+ int32_t src_stride,
+ int32_t dst_stride, int32_t cols,
+ uint8_t *f, int32_t size) {
+ if (8 == size) {
+ postproc_down_across_chroma_msa(src, dst, src_stride, dst_stride, cols, f);
+ } else if (16 == size) {
+ postproc_down_across_luma_msa(src, dst, src_stride, dst_stride, cols, f);
+ }
+}
+
+void aom_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch,
+ int32_t rows, int32_t cols, int32_t flimit) {
+ int32_t row, col, cnt;
+ uint8_t *src_dup = src_ptr;
+ v16u8 src0, src, tmp_orig;
+ v16u8 tmp = { 0 };
+ v16i8 zero = { 0 };
+ v8u16 sum_h, src_r_h, src_l_h;
+ v4u32 src_r_w, src_l_w;
+ v4i32 flimit_vec;
+
+ flimit_vec = __msa_fill_w(flimit);
+ for (row = rows; row--;) {
+ int32_t sum_sq = 0;
+ int32_t sum = 0;
+ src0 = (v16u8)__msa_fill_b(src_dup[0]);
+ ST8x1_UB(src0, (src_dup - 8));
+
+ src0 = (v16u8)__msa_fill_b(src_dup[cols - 1]);
+ ST_UB(src0, src_dup + cols);
+ src_dup[cols + 16] = src_dup[cols - 1];
+ tmp_orig = (v16u8)__msa_ldi_b(0);
+ tmp_orig[15] = tmp[15];
+ src = LD_UB(src_dup - 8);
+ src[15] = 0;
+ ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+ src_r_w = __msa_dotp_u_w(src_r_h, src_r_h);
+ src_l_w = __msa_dotp_u_w(src_l_h, src_l_h);
+ sum_sq = HADD_SW_S32(src_r_w);
+ sum_sq += HADD_SW_S32(src_l_w);
+ sum_h = __msa_hadd_u_h(src, src);
+ sum = HADD_UH_U32(sum_h);
+ {
+ v16u8 src7, src8, src_r, src_l;
+ v16i8 mask;
+ v8u16 add_r, add_l;
+ v8i16 sub_r, sub_l, sum_r, sum_l, mask0, mask1;
+ v4i32 sum_sq0, sum_sq1, sum_sq2, sum_sq3;
+ v4i32 sub0, sub1, sub2, sub3;
+ v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
+ v4i32 mul0, mul1, mul2, mul3;
+ v4i32 total0, total1, total2, total3;
+ v8i16 const8 = __msa_fill_h(8);
+
+ src7 = LD_UB(src_dup + 7);
+ src8 = LD_UB(src_dup - 8);
+ for (col = 0; col < (cols >> 4); ++col) {
+ ILVRL_B2_UB(src7, src8, src_r, src_l);
+ HSUB_UB2_SH(src_r, src_l, sub_r, sub_l);
+
+ sum_r[0] = sum + sub_r[0];
+ for (cnt = 0; cnt < 7; ++cnt) {
+ sum_r[cnt + 1] = sum_r[cnt] + sub_r[cnt + 1];
+ }
+ sum_l[0] = sum_r[7] + sub_l[0];
+ for (cnt = 0; cnt < 7; ++cnt) {
+ sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1];
+ }
+ sum = sum_l[7];
+ src = LD_UB(src_dup + 16 * col);
+ ILVRL_B2_UH(zero, src, src_r_h, src_l_h);
+ src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4);
+ src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4);
+ tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7);
+
+ HADD_UB2_UH(src_r, src_l, add_r, add_l);
+ UNPCK_SH_SW(sub_r, sub0, sub1);
+ UNPCK_SH_SW(sub_l, sub2, sub3);
+ ILVR_H2_SW(zero, add_r, zero, add_l, sum0_w, sum2_w);
+ ILVL_H2_SW(zero, add_r, zero, add_l, sum1_w, sum3_w);
+ MUL4(sum0_w, sub0, sum1_w, sub1, sum2_w, sub2, sum3_w, sub3, mul0, mul1,
+ mul2, mul3);
+ sum_sq0[0] = sum_sq + mul0[0];
+ for (cnt = 0; cnt < 3; ++cnt) {
+ sum_sq0[cnt + 1] = sum_sq0[cnt] + mul0[cnt + 1];
+ }
+ sum_sq1[0] = sum_sq0[3] + mul1[0];
+ for (cnt = 0; cnt < 3; ++cnt) {
+ sum_sq1[cnt + 1] = sum_sq1[cnt] + mul1[cnt + 1];
+ }
+ sum_sq2[0] = sum_sq1[3] + mul2[0];
+ for (cnt = 0; cnt < 3; ++cnt) {
+ sum_sq2[cnt + 1] = sum_sq2[cnt] + mul2[cnt + 1];
+ }
+ sum_sq3[0] = sum_sq2[3] + mul3[0];
+ for (cnt = 0; cnt < 3; ++cnt) {
+ sum_sq3[cnt + 1] = sum_sq3[cnt] + mul3[cnt + 1];
+ }
+ sum_sq = sum_sq3[3];
+
+ UNPCK_SH_SW(sum_r, sum0_w, sum1_w);
+ UNPCK_SH_SW(sum_l, sum2_w, sum3_w);
+ total0 = sum_sq0 * __msa_ldi_w(15);
+ total0 -= sum0_w * sum0_w;
+ total1 = sum_sq1 * __msa_ldi_w(15);
+ total1 -= sum1_w * sum1_w;
+ total2 = sum_sq2 * __msa_ldi_w(15);
+ total2 -= sum2_w * sum2_w;
+ total3 = sum_sq3 * __msa_ldi_w(15);
+ total3 -= sum3_w * sum3_w;
+ total0 = (total0 < flimit_vec);
+ total1 = (total1 < flimit_vec);
+ total2 = (total2 < flimit_vec);
+ total3 = (total3 < flimit_vec);
+ PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
+ mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
+ tmp = __msa_bmz_v(tmp, src, (v16u8)mask);
+
+ if (col == 0) {
+ uint64_t src_d;
+
+ src_d = __msa_copy_u_d((v2i64)tmp_orig, 1);
+ SD(src_d, (src_dup - 8));
+ }
+
+ src7 = LD_UB(src_dup + 16 * (col + 1) + 7);
+ src8 = LD_UB(src_dup + 16 * (col + 1) - 8);
+ ST_UB(tmp, (src_dup + (16 * col)));
+ }
+
+ src_dup += pitch;
+ }
+ }
+}
+
+void aom_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows,
+ int32_t cols, int32_t flimit) {
+ int32_t row, col, cnt, i;
+ const int16_t *rv3 = &aom_rv[63 & rand()];
+ v4i32 flimit_vec;
+ v16u8 dst7, dst8, dst_r_b, dst_l_b;
+ v16i8 mask;
+ v8u16 add_r, add_l;
+ v8i16 dst_r_h, dst_l_h, sub_r, sub_l, mask0, mask1;
+ v4i32 sub0, sub1, sub2, sub3, total0, total1, total2, total3;
+
+ flimit_vec = __msa_fill_w(flimit);
+
+ for (col = 0; col < (cols >> 4); ++col) {
+ uint8_t *dst_tmp = &dst_ptr[col << 4];
+ v16u8 dst;
+ v16i8 zero = { 0 };
+ v16u8 tmp[16];
+ v8i16 mult0, mult1, rv2_0, rv2_1;
+ v8i16 sum0_h = { 0 };
+ v8i16 sum1_h = { 0 };
+ v4i32 mul0 = { 0 };
+ v4i32 mul1 = { 0 };
+ v4i32 mul2 = { 0 };
+ v4i32 mul3 = { 0 };
+ v4i32 sum0_w, sum1_w, sum2_w, sum3_w;
+ v4i32 add0, add1, add2, add3;
+ const int16_t *rv2[16];
+
+ dst = LD_UB(dst_tmp);
+ for (cnt = (col << 4), i = 0; i < 16; ++cnt) {
+ rv2[i] = rv3 + ((cnt * 17) & 127);
+ ++i;
+ }
+ for (cnt = -8; cnt < 0; ++cnt) {
+ ST_UB(dst, dst_tmp + cnt * pitch);
+ }
+
+ dst = LD_UB((dst_tmp + (rows - 1) * pitch));
+ for (cnt = rows; cnt < rows + 17; ++cnt) {
+ ST_UB(dst, dst_tmp + cnt * pitch);
+ }
+ for (cnt = -8; cnt <= 6; ++cnt) {
+ dst = LD_UB(dst_tmp + (cnt * pitch));
+ UNPCK_UB_SH(dst, dst_r_h, dst_l_h);
+ MUL2(dst_r_h, dst_r_h, dst_l_h, dst_l_h, mult0, mult1);
+ mul0 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult0);
+ mul1 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult0);
+ mul2 += (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)mult1);
+ mul3 += (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)mult1);
+ ADD2(sum0_h, dst_r_h, sum1_h, dst_l_h, sum0_h, sum1_h);
+ }
+
+ for (row = 0; row < (rows + 8); ++row) {
+ for (i = 0; i < 8; ++i) {
+ rv2_0[i] = *(rv2[i] + (row & 127));
+ rv2_1[i] = *(rv2[i + 8] + (row & 127));
+ }
+ dst7 = LD_UB(dst_tmp + (7 * pitch));
+ dst8 = LD_UB(dst_tmp - (8 * pitch));
+ ILVRL_B2_UB(dst7, dst8, dst_r_b, dst_l_b);
+
+ HSUB_UB2_SH(dst_r_b, dst_l_b, sub_r, sub_l);
+ UNPCK_SH_SW(sub_r, sub0, sub1);
+ UNPCK_SH_SW(sub_l, sub2, sub3);
+ sum0_h += sub_r;
+ sum1_h += sub_l;
+
+ HADD_UB2_UH(dst_r_b, dst_l_b, add_r, add_l);
+
+ ILVRL_H2_SW(zero, add_r, add0, add1);
+ ILVRL_H2_SW(zero, add_l, add2, add3);
+ mul0 += add0 * sub0;
+ mul1 += add1 * sub1;
+ mul2 += add2 * sub2;
+ mul3 += add3 * sub3;
+ dst = LD_UB(dst_tmp);
+ ILVRL_B2_SH(zero, dst, dst_r_h, dst_l_h);
+ dst7 = (v16u8)((rv2_0 + sum0_h + dst_r_h) >> 4);
+ dst8 = (v16u8)((rv2_1 + sum1_h + dst_l_h) >> 4);
+ tmp[row & 15] = (v16u8)__msa_pckev_b((v16i8)dst8, (v16i8)dst7);
+
+ UNPCK_SH_SW(sum0_h, sum0_w, sum1_w);
+ UNPCK_SH_SW(sum1_h, sum2_w, sum3_w);
+ total0 = mul0 * __msa_ldi_w(15);
+ total0 -= sum0_w * sum0_w;
+ total1 = mul1 * __msa_ldi_w(15);
+ total1 -= sum1_w * sum1_w;
+ total2 = mul2 * __msa_ldi_w(15);
+ total2 -= sum2_w * sum2_w;
+ total3 = mul3 * __msa_ldi_w(15);
+ total3 -= sum3_w * sum3_w;
+ total0 = (total0 < flimit_vec);
+ total1 = (total1 < flimit_vec);
+ total2 = (total2 < flimit_vec);
+ total3 = (total3 < flimit_vec);
+ PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1);
+ mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0);
+ tmp[row & 15] = __msa_bmz_v(tmp[row & 15], dst, (v16u8)mask);
+
+ if (row >= 8) {
+ ST_UB(tmp[(row - 8) & 15], (dst_tmp - 8 * pitch));
+ }
+
+ dst_tmp += pitch;
+ }
+ }
+}
diff --git a/aom_dsp/mips/fwd_dct32x32_msa.c b/aom_dsp/mips/fwd_dct32x32_msa.c
index 2fae3a7..dc9c632 100644
--- a/aom_dsp/mips/fwd_dct32x32_msa.c
+++ b/aom_dsp/mips/fwd_dct32x32_msa.c
@@ -928,23 +928,21 @@
}
void aom_fdct32x32_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
- out[1] = 0;
-
- out[0] = LD_HADD(input, stride);
- out[0] += LD_HADD(input + 8, stride);
- out[0] += LD_HADD(input + 16, stride);
- out[0] += LD_HADD(input + 24, stride);
- out[0] += LD_HADD(input + 32 * 8, stride);
- out[0] += LD_HADD(input + 32 * 8 + 8, stride);
- out[0] += LD_HADD(input + 32 * 8 + 16, stride);
- out[0] += LD_HADD(input + 32 * 8 + 24, stride);
- out[0] += LD_HADD(input + 32 * 16, stride);
- out[0] += LD_HADD(input + 32 * 16 + 8, stride);
- out[0] += LD_HADD(input + 32 * 16 + 16, stride);
- out[0] += LD_HADD(input + 32 * 16 + 24, stride);
- out[0] += LD_HADD(input + 32 * 24, stride);
- out[0] += LD_HADD(input + 32 * 24 + 8, stride);
- out[0] += LD_HADD(input + 32 * 24 + 16, stride);
- out[0] += LD_HADD(input + 32 * 24 + 24, stride);
- out[0] >>= 3;
+ int sum = LD_HADD(input, stride);
+ sum += LD_HADD(input + 8, stride);
+ sum += LD_HADD(input + 16, stride);
+ sum += LD_HADD(input + 24, stride);
+ sum += LD_HADD(input + 32 * 8, stride);
+ sum += LD_HADD(input + 32 * 8 + 8, stride);
+ sum += LD_HADD(input + 32 * 8 + 16, stride);
+ sum += LD_HADD(input + 32 * 8 + 24, stride);
+ sum += LD_HADD(input + 32 * 16, stride);
+ sum += LD_HADD(input + 32 * 16 + 8, stride);
+ sum += LD_HADD(input + 32 * 16 + 16, stride);
+ sum += LD_HADD(input + 32 * 16 + 24, stride);
+ sum += LD_HADD(input + 32 * 24, stride);
+ sum += LD_HADD(input + 32 * 24 + 8, stride);
+ sum += LD_HADD(input + 32 * 24 + 16, stride);
+ sum += LD_HADD(input + 32 * 24 + 24, stride);
+ out[0] = (int16_t)(sum >> 3);
}
diff --git a/aom_dsp/mips/fwd_txfm_msa.c b/aom_dsp/mips/fwd_txfm_msa.c
index df8c7c8..f16d290 100644
--- a/aom_dsp/mips/fwd_txfm_msa.c
+++ b/aom_dsp/mips/fwd_txfm_msa.c
@@ -238,11 +238,9 @@
}
void aom_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) {
- out[1] = 0;
-
- out[0] = LD_HADD(input, stride);
- out[0] += LD_HADD(input + 8, stride);
- out[0] += LD_HADD(input + 16 * 8, stride);
- out[0] += LD_HADD(input + 16 * 8 + 8, stride);
- out[0] >>= 1;
+ int sum = LD_HADD(input, stride);
+ sum += LD_HADD(input + 8, stride);
+ sum += LD_HADD(input + 16 * 8, stride);
+ sum += LD_HADD(input + 16 * 8 + 8, stride);
+ out[0] = (int16_t)(sum >> 1);
}
diff --git a/aom_dsp/mips/macros_msa.h b/aom_dsp/mips/macros_msa.h
index 9a9fe01..48fbcfd 100644
--- a/aom_dsp/mips/macros_msa.h
+++ b/aom_dsp/mips/macros_msa.h
@@ -1119,6 +1119,7 @@
ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
+#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
/* Description : Interleave left half of halfword elements from vectors
@@ -1134,6 +1135,7 @@
out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
+#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
/* Description : Interleave left half of word elements from vectors
Arguments : Inputs - in0, in1, in2, in3
@@ -1202,6 +1204,7 @@
out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
+#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
out2, out3) \
@@ -1289,6 +1292,7 @@
out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
}
+#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
diff --git a/aom_dsp/postproc.h b/aom_dsp/postproc.h
new file mode 100644
index 0000000..f78a472
--- /dev/null
+++ b/aom_dsp/postproc.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_DSP_POSTPROC_H_
+#define AOM_DSP_POSTPROC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Fills a noise buffer with Gaussian noise of strength determined by sigma.
+int aom_setup_noise(double sigma, int size, char *noise);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_DSP_POSTPROC_H_
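A usage sketch for the declaration above, assuming only what the header states: the caller owns the noise buffer and passes its size together with the desired noise strength (the 3072-byte buffer and sigma of 4.0 are example values):

    #include "aom_dsp/postproc.h"

    static void setup_noise_example(void) {
      static char noise[3072];                      /* example size, caller-owned */
      (void)aom_setup_noise(4.0, (int)sizeof(noise), noise);  /* sigma = 4.0 */
    }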
diff --git a/aom_dsp/psnr.c b/aom_dsp/psnr.c
index bcf4e64..93899ba 100644
--- a/aom_dsp/psnr.c
+++ b/aom_dsp/psnr.c
@@ -123,9 +123,9 @@
}
#if CONFIG_AOM_HIGHBITDEPTH
-int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride, const uint8_t *b8,
- int b_stride, int width, int height,
- unsigned int input_shift) {
+static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int width,
+ int height, unsigned int input_shift) {
const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
int64_t total_sse = 0;
@@ -142,8 +142,8 @@
return total_sse;
}
-int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int width, int height) {
+static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
int64_t total_sse = 0;
int x, y;
const int dw = width % 16;
@@ -177,6 +177,14 @@
}
#endif // CONFIG_AOM_HIGHBITDEPTH
+int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height) {
+ return get_sse(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride,
+ b->y_buffer + vstart * b->y_stride + hstart, b->y_stride,
+ width, height);
+}
+
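The _part variant simply offsets both luma pointers by vstart rows and hstart columns before handing off to get_sse, so it measures a sub-rectangle rather than the whole crop. An illustrative call (the window position and size are made up):

    #include "aom_dsp/psnr.h"

    /* SSE of a 64x64 luma window whose top-left corner is at column 128,
     * row 64 of two already-allocated frames. */
    static int64_t window_sse(const YV12_BUFFER_CONFIG *src,
                              const YV12_BUFFER_CONFIG *rec) {
      return aom_get_y_sse_part(src, rec, /*hstart=*/128, /*width=*/64,
                                /*vstart=*/64, /*height=*/64);
    }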
int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a,
const YV12_BUFFER_CONFIG *b) {
assert(a->y_crop_width == b->y_crop_width);
@@ -186,7 +194,33 @@
a->y_crop_width, a->y_crop_height);
}
+int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+
+ return get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride,
+ a->uv_crop_width, a->uv_crop_height);
+}
+
+int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+
+ return get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
+ a->uv_crop_width, a->uv_crop_height);
+}
+
#if CONFIG_AOM_HIGHBITDEPTH
+int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height) {
+ return highbd_get_sse(
+ a->y_buffer + vstart * a->y_stride + hstart, a->y_stride,
+ b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, width, height);
+}
+
int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
const YV12_BUFFER_CONFIG *b) {
assert(a->y_crop_width == b->y_crop_width);
@@ -198,6 +232,30 @@
a->y_crop_width, a->y_crop_height);
}
+int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+ assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+ assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+ return highbd_get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride,
+ a->uv_crop_width, a->uv_crop_height);
+}
+
+int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+ assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+ assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+ return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
+ a->uv_crop_width, a->uv_crop_height);
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+#if CONFIG_AOM_HIGHBITDEPTH
void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
uint32_t bit_depth, uint32_t in_bit_depth) {
@@ -244,7 +302,7 @@
aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
}
-#endif
+#endif // !CONFIG_AOM_HIGHBITDEPTH
void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
PSNR_STATS *psnr) {
diff --git a/aom_dsp/psnr.h b/aom_dsp/psnr.h
index ab9c361..1cd6b19 100644
--- a/aom_dsp/psnr.h
+++ b/aom_dsp/psnr.h
@@ -35,12 +35,22 @@
* \param[in] sse Sum of squared errors
*/
double aom_sse_to_psnr(double samples, double peak, double sse);
+int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height);
int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
-
+int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
#if CONFIG_AOM_HIGHBITDEPTH
+int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height);
int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
const YV12_BUFFER_CONFIG *b);
-
+int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
unsigned int bit_depth, unsigned int in_bit_depth);
@@ -54,4 +64,4 @@
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AOM_DSP_PSNR_H_
\ No newline at end of file
+#endif // AOM_DSP_PSNR_H_
diff --git a/aom_dsp/quantize.h b/aom_dsp/quantize.h
index 7260eb2..45ed678 100644
--- a/aom_dsp/quantize.h
+++ b/aom_dsp/quantize.h
@@ -68,13 +68,7 @@
const int16_t *round_ptr, const int16_t quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
-void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
- uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan);
+
#if CONFIG_AOM_HIGHBITDEPTH
void aom_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
int skip_block, const int16_t *round_ptr,
@@ -87,13 +81,6 @@
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
-void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
- tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan);
#endif
#endif
diff --git a/aom_dsp/sad.c b/aom_dsp/sad.c
index fbe5acf..2172b6c 100644
--- a/aom_dsp/sad.c
+++ b/aom_dsp/sad.c
@@ -32,47 +32,6 @@
return sad;
}
-// TODO(johannkoenig): this moved to aom_dsp, should be able to clean this up.
-/* Remove dependency on av1 variance function by duplicating av1_comp_avg_pred.
- * The function averages every corresponding element of the buffers and stores
- * the value in a third buffer, comp_pred.
- * pred and comp_pred are assumed to have stride = width
- * In the usage below comp_pred is a local array.
- */
-static INLINE void avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, const uint8_t *ref, int ref_stride) {
- int i, j;
-
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
- const int tmp = pred[j] + ref[j];
- comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
- }
- comp_pred += width;
- pred += width;
- ref += ref_stride;
- }
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-static INLINE void highbd_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
- int width, int height, const uint8_t *ref8,
- int ref_stride) {
- int i, j;
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
- const int tmp = pred[j] + ref[j];
- comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
- }
- comp_pred += width;
- pred += width;
- ref += ref_stride;
- }
-}
-#endif // CONFIG_AOM_HIGHBITDEPTH
-
#define sadMxN(m, n) \
unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride) { \
@@ -82,7 +41,7 @@
const uint8_t *ref, int ref_stride, \
const uint8_t *second_pred) { \
uint8_t comp_pred[m * n]; \
- avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+ aom_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
return sad(src, src_stride, comp_pred, m, m, n); \
}
@@ -110,6 +69,22 @@
}
/* clang-format off */
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+// 128x128
+sadMxN(128, 128)
+sadMxNxK(128, 128, 3)
+sadMxNxK(128, 128, 8)
+sadMxNx4D(128, 128)
+
+// 128x64
+sadMxN(128, 64)
+sadMxNx4D(128, 64)
+
+// 64x128
+sadMxN(64, 128)
+sadMxNx4D(64, 128)
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+
// 64x64
sadMxN(64, 64)
sadMxNxK(64, 64, 3)
@@ -221,7 +196,7 @@
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *second_pred) { \
uint16_t comp_pred[m * n]; \
- highbd_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+ aom_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \
return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
}
@@ -248,6 +223,22 @@
}
/* clang-format off */
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+// 128x128
+highbd_sadMxN(128, 128)
+highbd_sadMxNxK(128, 128, 3)
+highbd_sadMxNxK(128, 128, 8)
+highbd_sadMxNx4D(128, 128)
+
+// 128x64
+highbd_sadMxN(128, 64)
+highbd_sadMxNx4D(128, 64)
+
+// 64x128
+highbd_sadMxN(64, 128)
+highbd_sadMxNx4D(64, 128)
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+
// 64x64
highbd_sadMxN(64, 64)
highbd_sadMxNxK(64, 64, 3)
@@ -316,17 +307,116 @@
highbd_sadMxNxK(4, 4, 8)
highbd_sadMxNx4D(4, 4)
/* clang-format on */
-
#endif // CONFIG_AOM_HIGHBITDEPTH
-#if CONFIG_MOTION_VAR
- // pre: predictor being evaluated
- // wsrc: target weighted prediction (has been *4096 to keep precision)
- // mask: 2d weights (scaled by 4096)
- static INLINE
- unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
- const int32_t *wsrc, const int32_t *mask, int width,
- int height) {
+#if CONFIG_AV1 && CONFIG_EXT_INTER
+ static INLINE
+ unsigned int masked_sad(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, const uint8_t *m, int m_stride,
+ int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]);
+
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+ sad = (sad + 31) >> 6;
+
+ return sad;
+}
+
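The mask here acts as a per-pixel weight, typically in the 0..64 range used by the AV1 blending code, so the accumulated weighted SAD is scaled back by 64 with rounding via (sad + 31) >> 6. Worked values for a single pixel pair with |a - b| = 5: a full weight of 64 gives 64 * 5 = 320 and (320 + 31) >> 6 = 5, while a half weight of 32 gives 160 and (160 + 31) >> 6 = 2 (2.5 rounds down).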
+#define MASKSADMxN(m, n) \
+ unsigned int aom_masked_sad##m##x##n##_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *msk, int msk_stride) { \
+ return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m, \
+ n); \
+ }
+
+/* clang-format off */
+#if CONFIG_EXT_PARTITION
+MASKSADMxN(128, 128)
+MASKSADMxN(128, 64)
+MASKSADMxN(64, 128)
+#endif // CONFIG_EXT_PARTITION
+MASKSADMxN(64, 64)
+MASKSADMxN(64, 32)
+MASKSADMxN(32, 64)
+MASKSADMxN(32, 32)
+MASKSADMxN(32, 16)
+MASKSADMxN(16, 32)
+MASKSADMxN(16, 16)
+MASKSADMxN(16, 8)
+MASKSADMxN(8, 16)
+MASKSADMxN(8, 8)
+MASKSADMxN(8, 4)
+MASKSADMxN(4, 8)
+MASKSADMxN(4, 4)
+/* clang-format on */
+
+#if CONFIG_AOM_HIGHBITDEPTH
+ static INLINE
+ unsigned int highbd_masked_sad(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ const uint8_t *m, int m_stride, int width,
+ int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) sad += m[x] * abs(a[x] - b[x]);
+
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+ sad = (sad + 31) >> 6;
+
+ return sad;
+}
+
+#define HIGHBD_MASKSADMXN(m, n) \
+ unsigned int aom_highbd_masked_sad##m##x##n##_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *msk, int msk_stride) { \
+ return highbd_masked_sad(src, src_stride, ref, ref_stride, msk, \
+ msk_stride, m, n); \
+ }
+
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN(128, 128)
+HIGHBD_MASKSADMXN(128, 64)
+HIGHBD_MASKSADMXN(64, 128)
+#endif // CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN(64, 64)
+HIGHBD_MASKSADMXN(64, 32)
+HIGHBD_MASKSADMXN(32, 64)
+HIGHBD_MASKSADMXN(32, 32)
+HIGHBD_MASKSADMXN(32, 16)
+HIGHBD_MASKSADMXN(16, 32)
+HIGHBD_MASKSADMXN(16, 16)
+HIGHBD_MASKSADMXN(16, 8)
+HIGHBD_MASKSADMXN(8, 16)
+HIGHBD_MASKSADMXN(8, 8)
+HIGHBD_MASKSADMXN(8, 4)
+HIGHBD_MASKSADMXN(4, 8)
+HIGHBD_MASKSADMXN(4, 4)
+#endif // CONFIG_AOM_HIGHBITDEPTH
+#endif // CONFIG_AV1 && CONFIG_EXT_INTER
+
+#if CONFIG_AV1 && CONFIG_MOTION_VAR
+// pre: predictor being evaluated
+// wsrc: target weighted prediction (has been *4096 to keep precision)
+// mask: 2d weights (scaled by 4096)
+static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ int width, int height) {
int y, x;
unsigned int sad = 0;
@@ -342,19 +432,36 @@
return sad;
}
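Per the comments above, wsrc carries the target prediction pre-multiplied by 4096 and mask holds per-pixel weights on the same scale, so each term compares pre[x] * mask[x] against wsrc[x] and folds the x4096 scale back out of the absolute difference. A hedged sketch of that accumulation (the body of obmc_sad is elided by the hunk above; the >> 12 rounding shown is the natural choice for a 4096 scale and may differ in detail from the library's helper):

    #include <stdint.h>
    #include <stdlib.h>

    /* Sketch of the weighted SAD the aom_obmc_sad##m##x##n##_c wrappers
     * compute: wsrc and mask are width*height arrays, pre is a strided
     * predictor block. */
    static unsigned int obmc_sad_sketch(const uint8_t *pre, int pre_stride,
                                        const int32_t *wsrc,
                                        const int32_t *mask, int width,
                                        int height) {
      unsigned int sad = 0;
      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x)
          sad += (abs(wsrc[x] - pre[x] * mask[x]) + 2048) >> 12;  /* /4096 */
        pre += pre_stride;
        wsrc += width;
        mask += width;
      }
      return sad;
    }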
-#define OBMC_SADMxN(m, n) \
+#define OBMCSADMxN(m, n) \
unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
const int32_t *wsrc, \
const int32_t *mask) { \
return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
}
-OBMC_SADMxN(64, 64) OBMC_SADMxN(64, 32) OBMC_SADMxN(32, 64) OBMC_SADMxN(32, 32)
- OBMC_SADMxN(32, 16) OBMC_SADMxN(16, 32) OBMC_SADMxN(16, 16)
- OBMC_SADMxN(16, 8) OBMC_SADMxN(8, 16) OBMC_SADMxN(8, 8)
- OBMC_SADMxN(8, 4) OBMC_SADMxN(4, 8) OBMC_SADMxN(4, 4)
+/* clang-format off */
+#if CONFIG_EXT_PARTITION
+OBMCSADMxN(128, 128)
+OBMCSADMxN(128, 64)
+OBMCSADMxN(64, 128)
+#endif // CONFIG_EXT_PARTITION
+OBMCSADMxN(64, 64)
+OBMCSADMxN(64, 32)
+OBMCSADMxN(32, 64)
+OBMCSADMxN(32, 32)
+OBMCSADMxN(32, 16)
+OBMCSADMxN(16, 32)
+OBMCSADMxN(16, 16)
+OBMCSADMxN(16, 8)
+OBMCSADMxN(8, 16)
+OBMCSADMxN(8, 8)
+OBMCSADMxN(8, 4)
+OBMCSADMxN(4, 8)
+OBMCSADMxN(4, 4)
+/* clang-format on */
+
#if CONFIG_AOM_HIGHBITDEPTH
- static INLINE
+ static INLINE
unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
const int32_t *wsrc, const int32_t *mask,
int width, int height) {
@@ -374,25 +481,32 @@
return sad;
}
-#define HIGHBD_OBMC_SADMXN(m, n) \
+#define HIGHBD_OBMCSADMXN(m, n) \
unsigned int aom_highbd_obmc_sad##m##x##n##_c( \
const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
const int32_t *mask) { \
return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
}
-HIGHBD_OBMC_SADMXN(64, 64)
-HIGHBD_OBMC_SADMXN(64, 32)
-HIGHBD_OBMC_SADMXN(32, 64)
-HIGHBD_OBMC_SADMXN(32, 32)
-HIGHBD_OBMC_SADMXN(32, 16)
-HIGHBD_OBMC_SADMXN(16, 32)
-HIGHBD_OBMC_SADMXN(16, 16)
-HIGHBD_OBMC_SADMXN(16, 8)
-HIGHBD_OBMC_SADMXN(8, 16)
-HIGHBD_OBMC_SADMXN(8, 8)
-HIGHBD_OBMC_SADMXN(8, 4)
-HIGHBD_OBMC_SADMXN(4, 8)
-HIGHBD_OBMC_SADMXN(4, 4)
+/* clang-format off */
+#if CONFIG_EXT_PARTITION
+HIGHBD_OBMCSADMXN(128, 128)
+HIGHBD_OBMCSADMXN(128, 64)
+HIGHBD_OBMCSADMXN(64, 128)
+#endif // CONFIG_EXT_PARTITION
+HIGHBD_OBMCSADMXN(64, 64)
+HIGHBD_OBMCSADMXN(64, 32)
+HIGHBD_OBMCSADMXN(32, 64)
+HIGHBD_OBMCSADMXN(32, 32)
+HIGHBD_OBMCSADMXN(32, 16)
+HIGHBD_OBMCSADMXN(16, 32)
+HIGHBD_OBMCSADMXN(16, 16)
+HIGHBD_OBMCSADMXN(16, 8)
+HIGHBD_OBMCSADMXN(8, 16)
+HIGHBD_OBMCSADMXN(8, 8)
+HIGHBD_OBMCSADMXN(8, 4)
+HIGHBD_OBMCSADMXN(4, 8)
+HIGHBD_OBMCSADMXN(4, 4)
+/* clang-format on */
#endif // CONFIG_AOM_HIGHBITDEPTH
-#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_AV1 && CONFIG_MOTION_VAR
diff --git a/aom_dsp/sum_squares.c b/aom_dsp/sum_squares.c
new file mode 100644
index 0000000..6b71d44
--- /dev/null
+++ b/aom_dsp/sum_squares.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./aom_dsp_rtcd.h"
+
+uint64_t aom_sum_squares_2d_i16_c(const int16_t *src, int src_stride,
+ int size) {
+ int r, c;
+ uint64_t ss = 0;
+
+ for (r = 0; r < size; r++) {
+ for (c = 0; c < size; c++) {
+ const int16_t v = src[c];
+ ss += v * v;
+ }
+ src += src_stride;
+ }
+
+ return ss;
+}
+
+uint64_t aom_sum_squares_i16_c(const int16_t *src, uint32_t n) {
+ uint64_t ss = 0;
+ do {
+ const int16_t v = *src++;
+ ss += v * v;
+ } while (--n);
+
+ return ss;
+}
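Both helpers are plain sums of squared int16 samples; the 2-D variant walks a size x size block with a stride, while the 1-D variant uses a do/while and therefore expects n to be non-zero. An illustrative call (block size and stride are examples):

    #include <stdint.h>
    #include "./aom_dsp_rtcd.h"

    /* Sum of squares of an 8x8 residual block stored with a stride of 32. */
    static uint64_t residual_energy(const int16_t *residual) {
      return aom_sum_squares_2d_i16_c(residual, /*src_stride=*/32, /*size=*/8);
    }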
diff --git a/aom_dsp/txfm_common.h b/aom_dsp/txfm_common.h
index 30ee66f..a5e964a 100644
--- a/aom_dsp/txfm_common.h
+++ b/aom_dsp/txfm_common.h
@@ -58,16 +58,11 @@
static const tran_high_t cospi_30_64 = 1606;
static const tran_high_t cospi_31_64 = 804;
-// 16384 * sqrt(2) * sin(k * Pi / 9) * 2 / 3
+// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
static const tran_high_t sinpi_1_9 = 5283;
static const tran_high_t sinpi_2_9 = 9929;
static const tran_high_t sinpi_3_9 = 13377;
static const tran_high_t sinpi_4_9 = 15212;
-#if CONFIG_CB4X4
-// 16384 * sqrt(2/5) * 2 * sin(k * Pi / 5)
-static const tran_high_t sinpi_1_5 = 12181;
-static const tran_high_t sinpi_2_5 = 19710;
-#endif
// 16384 * sqrt(2)
static const tran_high_t Sqrt2 = 23170;
diff --git a/aom_dsp/variance.c b/aom_dsp/variance.c
index 8528f5a..c93f98e 100644
--- a/aom_dsp/variance.c
+++ b/aom_dsp/variance.c
@@ -8,6 +8,7 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <stdlib.h>
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
@@ -16,11 +17,7 @@
#include "aom/aom_integer.h"
#include "aom_dsp/variance.h"
-
-static const uint8_t bilinear_filters[8][2] = {
- { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
- { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
-};
+#include "aom_dsp/aom_filter.h"
uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride) {
@@ -151,7 +148,7 @@
uint32_t *sse) { \
int sum; \
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
- return *sse - (((int64_t)sum * sum) / (W * H)); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
}
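With N = W * H, sse = sum of d^2 and sum = sum of d for d = a - b, the return value is the standard shortcut form sse - sum^2 / N; the added cast only makes explicit that the 64-bit quotient is narrowed back to 32 bits. Worked check for a hypothetical 2x2 block with differences {1, 2, 3, 4}: sse = 30, sum = 10, N = 4, so the macro returns 30 - 100 / 4 = 5.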
#define SUBPIX_VAR(W, H) \
@@ -162,9 +159,9 @@
uint8_t temp2[H * W]; \
\
var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
- bilinear_filters[xoffset]); \
+ bilinear_filters_2t[xoffset]); \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
+ bilinear_filters_2t[yoffset]); \
\
return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
}
@@ -179,9 +176,9 @@
DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
\
var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
- bilinear_filters[xoffset]); \
+ bilinear_filters_2t[xoffset]); \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
+ bilinear_filters_2t[yoffset]); \
\
aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
\
@@ -218,7 +215,11 @@
SUBPIX_VAR(W, H) \
SUBPIX_AVG_VAR(W, H)
-/* clang-format off */
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+VARIANCES(128, 128)
+VARIANCES(128, 64)
+VARIANCES(64, 128)
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
@@ -240,7 +241,6 @@
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)
-/* clang-format on */
void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride) {
@@ -258,25 +258,25 @@
}
// Get pred block from up-sampled reference.
-void aom_upsampled_pred_c(uint8_t *pred, int width, int height,
- const uint8_t *ref, const int ref_stride) {
- const int stride = ref_stride << 3;
+void aom_upsampled_pred_c(uint8_t *comp_pred, int width, int height,
+ const uint8_t *ref, int ref_stride) {
int i, j, k;
+ int stride = ref_stride << 3;
for (i = 0; i < height; i++) {
for (j = 0, k = 0; j < width; j++, k += 8) {
- pred[j] = ref[k];
+ comp_pred[j] = ref[k];
}
- pred += width;
+ comp_pred += width;
ref += stride;
}
}
void aom_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
int width, int height, const uint8_t *ref,
- const int ref_stride) {
- const int stride = ref_stride << 3;
+ int ref_stride) {
int i, j;
+ int stride = ref_stride << 3;
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
@@ -354,16 +354,20 @@
const uint8_t *b, int b_stride, \
uint32_t *sse) { \
int sum; \
+ int64_t var; \
highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
- return *sse - (((int64_t)sum * sum) / (W * H)); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
} \
\
uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
const uint8_t *b, int b_stride, \
uint32_t *sse) { \
int sum; \
+ int64_t var; \
highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
- return *sse - (((int64_t)sum * sum) / (W * H)); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
}
#define HIGHBD_GET_VAR(S) \
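
In the 10-bit and 12-bit macros above, sse and sum are pre-rounded down to 8-bit scale before the variance formula is applied, so sse - sum^2/(W*H) can come out slightly negative purely from rounding; the change therefore evaluates it in a signed 64-bit temporary and clamps at zero instead of letting the unsigned return value wrap. A scalar sketch of that guard (illustrative only):

#include <stdint.h>

// variance = SSE - sum^2 / N, clamped so rounding can never make it wrap.
static uint32_t clamped_variance(uint32_t sse, int sum, int n) {
  const int64_t var = (int64_t)sse - ((int64_t)sum * sum) / n;
  return var >= 0 ? (uint32_t)var : 0;
}
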
@@ -410,7 +414,7 @@
return *sse; \
}
-static void aom_highbd_var_filter_block2d_bil_first_pass(
+void aom_highbd_var_filter_block2d_bil_first_pass(
const uint8_t *src_ptr8, uint16_t *output_ptr,
unsigned int src_pixels_per_line, int pixel_step,
unsigned int output_height, unsigned int output_width,
@@ -432,7 +436,7 @@
}
}
-static void aom_highbd_var_filter_block2d_bil_second_pass(
+void aom_highbd_var_filter_block2d_bil_second_pass(
const uint16_t *src_ptr, uint16_t *output_ptr,
unsigned int src_pixels_per_line, unsigned int pixel_step,
unsigned int output_height, unsigned int output_width,
@@ -452,111 +456,111 @@
}
}
-#define HIGHBD_SUBPIX_VAR(W, H) \
- uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
- \
- return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
- dst, dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
- \
- return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
- dst, dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
- \
- return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
- dst, dst_stride, sse); \
+#define HIGHBD_SUBPIX_VAR(W, H) \
+ uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
}
-#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
- uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
- \
- aom_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
- \
- return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
- dst, dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
- \
- aom_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
- \
- return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
- dst, dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
- \
- aom_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
- \
- return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
- dst, dst_stride, sse); \
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+ uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
}
/* All three forms of the variance are available in the same sizes. */
@@ -565,7 +569,11 @@
HIGHBD_SUBPIX_VAR(W, H) \
HIGHBD_SUBPIX_AVG_VAR(W, H)
-/* clang-format off */
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+HIGHBD_VARIANCES(128, 128)
+HIGHBD_VARIANCES(128, 64)
+HIGHBD_VARIANCES(64, 128)
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
@@ -587,11 +595,10 @@
HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
-/* clang-format on */
-void aom_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
- int width, int height, const uint8_t *ref8,
- int ref_stride) {
+void aom_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride) {
int i, j;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
@@ -606,17 +613,17 @@
}
}
-void aom_highbd_upsampled_pred_c(uint16_t *pred, int width, int height,
- const uint8_t *ref8, const int ref_stride) {
- const int stride = ref_stride << 3;
+void aom_highbd_upsampled_pred_c(uint16_t *comp_pred, int width, int height,
+ const uint8_t *ref8, int ref_stride) {
int i, j;
+ int stride = ref_stride << 3;
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
- pred[j] = ref[(j << 3)];
+ comp_pred[j] = ref[(j << 3)];
}
- pred += width;
+ comp_pred += width;
ref += stride;
}
}
@@ -624,9 +631,9 @@
void aom_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred,
const uint8_t *pred8, int width,
int height, const uint8_t *ref8,
- const int ref_stride) {
- const int stride = ref_stride << 3;
+ int ref_stride) {
int i, j;
+ int stride = ref_stride << 3;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
@@ -642,7 +649,300 @@
}
#endif // CONFIG_AOM_HIGHBITDEPTH
-#if CONFIG_MOTION_VAR
+#if CONFIG_AV1 && CONFIG_EXT_INTER
+void masked_variance(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, const uint8_t *m, int m_stride, int w, int h,
+ unsigned int *sse, int *sum) {
+ int i, j;
+
+ int64_t sum64 = 0;
+ uint64_t sse64 = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = (a[j] - b[j]) * (m[j]);
+ sum64 += diff;
+ sse64 += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+ sum64 = (sum64 >= 0) ? sum64 : -sum64;
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 6);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 12);
+}
+
+#define MASK_VAR(W, H) \
+ unsigned int aom_masked_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ int sum; \
+ masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, &sum); \
+ return *sse - (((int64_t)sum * sum) / (W * H)); \
+ }
+
+#define MASK_SUBPIX_VAR(W, H) \
+ unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
+ bilinear_filters_2t[xoffset]); \
+ var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ return aom_masked_variance##W##x##H##_c(temp2, W, dst, dst_stride, msk, \
+ msk_stride, sse); \
+ }
+
+MASK_VAR(4, 4)
+MASK_SUBPIX_VAR(4, 4)
+
+MASK_VAR(4, 8)
+MASK_SUBPIX_VAR(4, 8)
+
+MASK_VAR(8, 4)
+MASK_SUBPIX_VAR(8, 4)
+
+MASK_VAR(8, 8)
+MASK_SUBPIX_VAR(8, 8)
+
+MASK_VAR(8, 16)
+MASK_SUBPIX_VAR(8, 16)
+
+MASK_VAR(16, 8)
+MASK_SUBPIX_VAR(16, 8)
+
+MASK_VAR(16, 16)
+MASK_SUBPIX_VAR(16, 16)
+
+MASK_VAR(16, 32)
+MASK_SUBPIX_VAR(16, 32)
+
+MASK_VAR(32, 16)
+MASK_SUBPIX_VAR(32, 16)
+
+MASK_VAR(32, 32)
+MASK_SUBPIX_VAR(32, 32)
+
+MASK_VAR(32, 64)
+MASK_SUBPIX_VAR(32, 64)
+
+MASK_VAR(64, 32)
+MASK_SUBPIX_VAR(64, 32)
+
+MASK_VAR(64, 64)
+MASK_SUBPIX_VAR(64, 64)
+
+#if CONFIG_EXT_PARTITION
+MASK_VAR(64, 128)
+MASK_SUBPIX_VAR(64, 128)
+
+MASK_VAR(128, 64)
+MASK_SUBPIX_VAR(128, 64)
+
+MASK_VAR(128, 128)
+MASK_SUBPIX_VAR(128, 128)
+#endif // CONFIG_EXT_PARTITION
+
+#if CONFIG_AOM_HIGHBITDEPTH
+void highbd_masked_variance64(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m,
+ int m_stride, int w, int h, uint64_t *sse,
+ int64_t *sum) {
+ int i, j;
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = (a[j] - b[j]) * (m[j]);
+ *sum += (int64_t)diff;
+ *sse += (int64_t)diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+ *sum = (*sum >= 0) ? *sum : -*sum;
+ *sum = ROUND_POWER_OF_TWO(*sum, 6);
+ *sse = ROUND_POWER_OF_TWO(*sse, 12);
+}
+
+void highbd_masked_variance(const uint8_t *a8, int a_stride, const uint8_t *b8,
+ int b_stride, const uint8_t *m, int m_stride, int w,
+ int h, unsigned int *sse, int *sum) {
+ int64_t sum64;
+ uint64_t sse64;
+ highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h,
+ &sse64, &sum64);
+ *sum = (int)sum64;
+ *sse = (unsigned int)sse64;
+}
+
+void highbd_10_masked_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ const uint8_t *m, int m_stride, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64;
+ uint64_t sse64;
+ highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h,
+ &sse64, &sum64);
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+void highbd_12_masked_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ const uint8_t *m, int m_stride, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64;
+ uint64_t sse64;
+ highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride, w, h,
+ &sse64, &sum64);
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
+
+#define HIGHBD_MASK_VAR(W, H) \
+ unsigned int aom_highbd_masked_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ int sum; \
+ highbd_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, \
+ &sum); \
+ return *sse - (((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ unsigned int aom_highbd_10_masked_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ int sum; \
+ highbd_10_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, \
+ sse, &sum); \
+ return *sse - (((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ unsigned int aom_highbd_12_masked_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ int sum; \
+ highbd_12_masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, \
+ sse, &sum); \
+ return *sse - (((int64_t)sum * sum) / (W * H)); \
+ }
+
+#define HIGHBD_MASK_SUBPIX_VAR(W, H) \
+ unsigned int aom_highbd_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_masked_variance##W##x##H##_c( \
+ CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \
+ } \
+ \
+ unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_10_masked_variance##W##x##H##_c( \
+ CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \
+ } \
+ \
+ unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_12_masked_variance##W##x##H##_c( \
+ CONVERT_TO_BYTEPTR(temp2), W, dst, dst_stride, msk, msk_stride, sse); \
+ }
+
+HIGHBD_MASK_VAR(4, 4)
+HIGHBD_MASK_SUBPIX_VAR(4, 4)
+
+HIGHBD_MASK_VAR(4, 8)
+HIGHBD_MASK_SUBPIX_VAR(4, 8)
+
+HIGHBD_MASK_VAR(8, 4)
+HIGHBD_MASK_SUBPIX_VAR(8, 4)
+
+HIGHBD_MASK_VAR(8, 8)
+HIGHBD_MASK_SUBPIX_VAR(8, 8)
+
+HIGHBD_MASK_VAR(8, 16)
+HIGHBD_MASK_SUBPIX_VAR(8, 16)
+
+HIGHBD_MASK_VAR(16, 8)
+HIGHBD_MASK_SUBPIX_VAR(16, 8)
+
+HIGHBD_MASK_VAR(16, 16)
+HIGHBD_MASK_SUBPIX_VAR(16, 16)
+
+HIGHBD_MASK_VAR(16, 32)
+HIGHBD_MASK_SUBPIX_VAR(16, 32)
+
+HIGHBD_MASK_VAR(32, 16)
+HIGHBD_MASK_SUBPIX_VAR(32, 16)
+
+HIGHBD_MASK_VAR(32, 32)
+HIGHBD_MASK_SUBPIX_VAR(32, 32)
+
+HIGHBD_MASK_VAR(32, 64)
+HIGHBD_MASK_SUBPIX_VAR(32, 64)
+
+HIGHBD_MASK_VAR(64, 32)
+HIGHBD_MASK_SUBPIX_VAR(64, 32)
+
+HIGHBD_MASK_VAR(64, 64)
+HIGHBD_MASK_SUBPIX_VAR(64, 64)
+
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASK_VAR(64, 128)
+HIGHBD_MASK_SUBPIX_VAR(64, 128)
+
+HIGHBD_MASK_VAR(128, 64)
+HIGHBD_MASK_SUBPIX_VAR(128, 64)
+
+HIGHBD_MASK_VAR(128, 128)
+HIGHBD_MASK_SUBPIX_VAR(128, 128)
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_AOM_HIGHBITDEPTH
+#endif // CONFIG_AV1 && CONFIG_EXT_INTER
+
+#if CONFIG_AV1 && CONFIG_MOTION_VAR
static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
const int32_t *wsrc, const int32_t *mask,
int w, int h, unsigned int *sse, int *sum) {
@@ -681,9 +981,9 @@
uint8_t temp2[H * W]; \
\
var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, H + 1, W, \
- bilinear_filters[xoffset]); \
+ bilinear_filters_2t[xoffset]); \
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
+ bilinear_filters_2t[yoffset]); \
\
return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \
}
@@ -727,6 +1027,17 @@
OBMC_VAR(64, 64)
OBMC_SUBPIX_VAR(64, 64)
+#if CONFIG_EXT_PARTITION
+OBMC_VAR(64, 128)
+OBMC_SUBPIX_VAR(64, 128)
+
+OBMC_VAR(128, 64)
+OBMC_SUBPIX_VAR(128, 64)
+
+OBMC_VAR(128, 128)
+OBMC_SUBPIX_VAR(128, 128)
+#endif // CONFIG_EXT_PARTITION
+
#if CONFIG_AOM_HIGHBITDEPTH
static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
const int32_t *wsrc,
@@ -817,9 +1128,9 @@
uint16_t temp2[H * W]; \
\
aom_highbd_var_filter_block2d_bil_first_pass( \
- pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
wsrc, mask, sse); \
@@ -832,9 +1143,9 @@
uint16_t temp2[H * W]; \
\
aom_highbd_var_filter_block2d_bil_first_pass( \
- pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, wsrc, mask, sse); \
@@ -847,9 +1158,9 @@
uint16_t temp2[H * W]; \
\
aom_highbd_var_filter_block2d_bil_first_pass( \
- pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
- bilinear_filters[yoffset]); \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
\
return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
W, wsrc, mask, sse); \
@@ -893,5 +1204,16 @@
HIGHBD_OBMC_VAR(64, 64)
HIGHBD_OBMC_SUBPIX_VAR(64, 64)
+
+#if CONFIG_EXT_PARTITION
+HIGHBD_OBMC_VAR(64, 128)
+HIGHBD_OBMC_SUBPIX_VAR(64, 128)
+
+HIGHBD_OBMC_VAR(128, 64)
+HIGHBD_OBMC_SUBPIX_VAR(128, 64)
+
+HIGHBD_OBMC_VAR(128, 128)
+HIGHBD_OBMC_SUBPIX_VAR(128, 128)
+#endif // CONFIG_EXT_PARTITION
#endif // CONFIG_AOM_HIGHBITDEPTH
-#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_AV1 && CONFIG_MOTION_VAR
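
In the masked variance helpers added above, each pixel difference is weighted by the mask value before accumulation, and the totals are then scaled back with ROUND_POWER_OF_TWO(sum, 6) and ROUND_POWER_OF_TWO(sse, 12). That is consistent with mask weights on a 0..64 scale (an assumption here, not stated in this hunk): dividing the sum by 64 and the sum of squares by 64^2 returns both to pixel units. A compact sketch of the same accumulation:

#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

// Mask-weighted sum/SSE over a w x h block; mask weights assumed to be 0..64.
static void masked_sums(const uint8_t *a, int a_stride, const uint8_t *b,
                        int b_stride, const uint8_t *m, int m_stride, int w,
                        int h, uint64_t *sse, int64_t *sum) {
  *sse = 0;
  *sum = 0;
  for (int i = 0; i < h; i++) {
    for (int j = 0; j < w; j++) {
      const int diff = (a[j] - b[j]) * m[j];
      *sum += diff;
      *sse += (int64_t)diff * diff;
    }
    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  if (*sum < 0) *sum = -*sum;               // match the abs() in the patch
  *sum = ROUND_POWER_OF_TWO(*sum, 6);       // back to pixel scale (/64)
  *sse = ROUND_POWER_OF_TWO(*sse, 12);      // /64^2 for the squared terms
}
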
diff --git a/aom_dsp/variance.h b/aom_dsp/variance.h
index 2e9077a..222fdaf 100644
--- a/aom_dsp/variance.h
+++ b/aom_dsp/variance.h
@@ -24,10 +24,10 @@
#define FILTER_WEIGHT 128
typedef unsigned int (*aom_sad_fn_t)(const uint8_t *a, int a_stride,
- const uint8_t *b_ptr, int b_stride);
+ const uint8_t *b, int b_stride);
-typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride,
+typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
const uint8_t *second_pred);
typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b,
@@ -50,6 +50,24 @@
const uint8_t *b, int b_stride,
unsigned int *sse);
+typedef unsigned int (*aom_subp_avg_variance_fn_t)(
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
+ int b_stride, unsigned int *sse, const uint8_t *second_pred);
+
+#if CONFIG_AV1 && CONFIG_EXT_INTER
+typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *msk_ptr,
+ int msk_stride);
+typedef unsigned int (*aom_masked_variance_fn_t)(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ const uint8_t *msk, int msk_stride, unsigned int *sse);
+typedef unsigned int (*aom_masked_subpixvariance_fn_t)(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *ref, int ref_stride, const uint8_t *msk, int msk_stride,
+ unsigned int *sse);
+#endif // CONFIG_AV1 && CONFIG_EXT_INTER
+
#if CONFIG_AV1 && CONFIG_MOTION_VAR
typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
const int32_t *wsrc,
@@ -64,10 +82,6 @@
const int32_t *wsrc, const int32_t *msk, unsigned int *sse);
#endif // CONFIG_AV1 && CONFIG_MOTION_VAR
-typedef unsigned int (*aom_subp_avg_variance_fn_t)(
- const uint8_t *a_ptr, int a_stride, int xoffset, int yoffset,
- const uint8_t *b_ptr, int b_stride, unsigned int *sse,
- const uint8_t *second_pred);
#if CONFIG_AV1
typedef struct aom_variance_vtable {
aom_sad_fn_t sdf;
@@ -78,6 +92,11 @@
aom_sad_multi_fn_t sdx3f;
aom_sad_multi_fn_t sdx8f;
aom_sad_multi_d_fn_t sdx4df;
+#if CONFIG_EXT_INTER
+ aom_masked_sad_fn_t msdf;
+ aom_masked_variance_fn_t mvf;
+ aom_masked_subpixvariance_fn_t msvf;
+#endif // CONFIG_EXT_INTER
#if CONFIG_MOTION_VAR
aom_obmc_sad_fn_t osdf;
aom_obmc_variance_fn_t ovf;
@@ -86,6 +105,18 @@
} aom_variance_fn_ptr_t;
#endif // CONFIG_AV1
+void aom_highbd_var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr8, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter);
+
+void aom_highbd_var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, unsigned int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter);
+
#ifdef __cplusplus
} // extern "C"
#endif
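
The new masked typedefs slot into aom_variance_fn_ptr_t next to the existing SAD/variance entries. A purely hypothetical wiring for one block size, assuming the usual ./aom_dsp_rtcd.h prototypes are available; the masked SAD entry (msdf) is left out because its implementation is not part of this diff:

#include "./aom_dsp_rtcd.h"
#include "aom_dsp/variance.h"

#if CONFIG_AV1 && CONFIG_EXT_INTER
// Illustrative sketch only: fill the 16x16 masked slots with the C kernels
// defined in aom_dsp/variance.c above.
static void init_masked_16x16(aom_variance_fn_ptr_t *fn) {
  fn->mvf = aom_masked_variance16x16_c;
  fn->msvf = aom_masked_sub_pixel_variance16x16_c;
}
#endif  // CONFIG_AV1 && CONFIG_EXT_INTER
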
diff --git a/aom_dsp/x86/add_noise_sse2.asm b/aom_dsp/x86/add_noise_sse2.asm
new file mode 100644
index 0000000..18fc165
--- /dev/null
+++ b/aom_dsp/x86/add_noise_sse2.asm
@@ -0,0 +1,83 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+;void aom_plane_add_noise_sse2(unsigned char *start, unsigned char *noise,
+; unsigned char blackclamp[16],
+; unsigned char whiteclamp[16],
+; unsigned char bothclamp[16],
+; unsigned int width, unsigned int height,
+; int pitch)
+global sym(aom_plane_add_noise_sse2) PRIVATE
+sym(aom_plane_add_noise_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 8
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ; get the clamps in registers
+ mov rdx, arg(2) ; blackclamp
+ movdqu xmm3, [rdx]
+ mov rdx, arg(3) ; whiteclamp
+ movdqu xmm4, [rdx]
+ mov rdx, arg(4) ; bothclamp
+ movdqu xmm5, [rdx]
+
+.addnoise_loop:
+ call sym(LIBAOM_RAND) WRT_PLT
+ mov rcx, arg(1) ;noise
+ and rax, 0xff
+ add rcx, rax
+
+ mov rdi, rcx
+ movsxd rcx, dword arg(5) ;[Width]
+ mov rsi, arg(0) ;Pos
+ xor rax,rax
+
+.addnoise_nextset:
+ movdqu xmm1,[rsi+rax] ; get the source
+
+ psubusb xmm1, xmm3 ; subtract black clamp
+ paddusb xmm1, xmm5 ; add both clamp
+ psubusb xmm1, xmm4 ; subtract whiteclamp
+
+ movdqu xmm2,[rdi+rax] ; get the noise for this line
+ paddb xmm1,xmm2 ; add it in
+ movdqu [rsi+rax],xmm1 ; store the result
+
+ add rax,16 ; move to the next line
+
+ cmp rax, rcx
+ jl .addnoise_nextset
+
+ movsxd rax, dword arg(7) ; Pitch
+ add arg(0), rax ; Start += Pitch
+ sub dword arg(6), 1 ; Height -= 1
+ jg .addnoise_loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+rd42:
+ times 8 dw 0x04
+four8s:
+ times 4 dd 8
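
A rough scalar equivalent of the SSE2 routine above (an illustrative sketch, not the library's C fallback): each pixel is first pulled into a safe range with the black/white/both clamps using saturating byte arithmetic, then a row of noise taken from a pseudo-random offset is added with a wrapping byte add, just as paddb does. The 16-byte clamp vectors are treated as one repeated value here, and the noise buffer is assumed to hold at least width + 255 bytes.

#include <stdint.h>
#include <stdlib.h>

static uint8_t sat_sub_u8(uint8_t a, uint8_t b) { return a > b ? a - b : 0; }
static uint8_t sat_add_u8(uint8_t a, uint8_t b) {
  const int s = a + b;
  return s > 255 ? 255 : (uint8_t)s;
}

static void plane_add_noise_sketch(uint8_t *start, const uint8_t *noise,
                                   uint8_t blackclamp, uint8_t whiteclamp,
                                   uint8_t bothclamp, unsigned width,
                                   unsigned height, int pitch) {
  for (unsigned i = 0; i < height; i++) {
    const uint8_t *ref = noise + (rand() & 0xff);  // per-row offset, as in .addnoise_loop
    for (unsigned j = 0; j < width; j++) {
      uint8_t v = start[j];
      v = sat_sub_u8(v, blackclamp);   // psubusb blackclamp
      v = sat_add_u8(v, bothclamp);    // paddusb bothclamp
      v = sat_sub_u8(v, whiteclamp);   // psubusb whiteclamp
      start[j] = (uint8_t)(v + ref[j]);  // paddb: wrapping byte add
    }
    start += pitch;
  }
}
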
diff --git a/aom_dsp/x86/aom_convolve_copy_sse2.asm b/aom_dsp/x86/aom_convolve_copy_sse2.asm
index ca1fe63..a094f80 100644
--- a/aom_dsp/x86/aom_convolve_copy_sse2.asm
+++ b/aom_dsp/x86/aom_convolve_copy_sse2.asm
@@ -49,6 +49,119 @@
je .w16
cmp r4d, 32
je .w32
+
+%if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ cmp r4d, 64
+ je .w64
+%ifidn %2, highbd
+ cmp r4d, 128
+ je .w128
+
+.w256:
+ mov r4d, dword hm
+.loop256:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ movu m0, [srcq+64]
+ movu m1, [srcq+80]
+ movu m2, [srcq+96]
+ movu m3, [srcq+112]
+%ifidn %1, avg
+ pavg m0, [dstq+64]
+ pavg m1, [dstq+80]
+ pavg m2, [dstq+96]
+ pavg m3, [dstq+112]
+%endif
+ mova [dstq+64], m0
+ mova [dstq+80], m1
+ mova [dstq+96], m2
+ mova [dstq+112], m3
+ movu m0, [srcq+128]
+ movu m1, [srcq+128+16]
+ movu m2, [srcq+128+32]
+ movu m3, [srcq+128+48]
+%ifidn %1, avg
+ pavg m0, [dstq+128]
+ pavg m1, [dstq+128+16]
+ pavg m2, [dstq+128+32]
+ pavg m3, [dstq+128+48]
+%endif
+ mova [dstq+128 ], m0
+ mova [dstq+128+16], m1
+ mova [dstq+128+32], m2
+ mova [dstq+128+48], m3
+ movu m0, [srcq+128+64]
+ movu m1, [srcq+128+80]
+ movu m2, [srcq+128+96]
+ movu m3, [srcq+128+112]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq+128+64]
+ pavg m1, [dstq+128+80]
+ pavg m2, [dstq+128+96]
+ pavg m3, [dstq+128+112]
+%endif
+ mova [dstq+128+64], m0
+ mova [dstq+128+80], m1
+ mova [dstq+128+96], m2
+ mova [dstq+128+112], m3
+ add dstq, dst_strideq
+ sub r4d, 1
+ jnz .loop256
+ RET
+%endif
+
+.w128:
+ mov r4d, dword hm
+.loop128:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+%ifidn %1, avg
+ pavg m0, [dstq]
+ pavg m1, [dstq+16]
+ pavg m2, [dstq+32]
+ pavg m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ movu m0, [srcq+64]
+ movu m1, [srcq+80]
+ movu m2, [srcq+96]
+ movu m3, [srcq+112]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavg m0, [dstq+64]
+ pavg m1, [dstq+80]
+ pavg m2, [dstq+96]
+ pavg m3, [dstq+112]
+%endif
+ mova [dstq+64], m0
+ mova [dstq+80], m1
+ mova [dstq+96], m2
+ mova [dstq+112], m3
+ add dstq, dst_strideq
+ sub r4d, 1
+ jnz .loop128
+ RET
+
+%else ; CONFIG_AV1 && CONFIG_EXT_PARTITION
+
%ifidn %2, highbd
cmp r4d, 64
je .w64
@@ -85,12 +198,13 @@
mova [dstq+96], m2
mova [dstq+112], m3
add dstq, dst_strideq
- dec r4d
+ sub r4d, 1
jnz .loop128
RET
%endif
+%endif ; CONFIG_AV1 && CONFIG_EXT_PARTITION
-.w64
+.w64:
mov r4d, dword hm
.loop64:
movu m0, [srcq]
@@ -109,7 +223,7 @@
mova [dstq+32], m2
mova [dstq+48], m3
add dstq, dst_strideq
- dec r4d
+ sub r4d, 1
jnz .loop64
RET
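
The new .w128/.w256 paths above simply stream whole rows through four 16-byte loads and stores at a time; the 256-byte case covers a 128-pixel high-bit-depth row (128 pixels * 2 bytes). Per row, the copy/average reduces to something like this C sketch (illustrative only):

#include <stdint.h>
#include <string.h>

/* Row-wise copy, or average into dst when avg != 0; the assembly unrolls this
 * 16 bytes at a time with movu/pavgb/mova. */
static void convolve_copy_rows(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int row_bytes, int h, int avg) {
  for (int y = 0; y < h; y++) {
    if (avg) {
      for (int x = 0; x < row_bytes; x++)
        dst[x] = (uint8_t)((dst[x] + src[x] + 1) >> 1);  // pavgb rounding
    } else {
      memcpy(dst, src, row_bytes);
    }
    src += src_stride;
    dst += dst_stride;
  }
}
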
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
index 59a2196..1449dd8 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -37,9 +37,10 @@
};
#if defined(__clang__)
-#if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
- (defined(__APPLE__) && ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
- (__clang_major__ == 5 && __clang_minor__ == 0)))
+#if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ <= 3) || \
+ (defined(__APPLE__) && defined(__apple_build_version__) && \
+ ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
+ (__clang_major__ == 5 && __clang_minor__ == 0)))
#define MM256_BROADCASTSI128_SI256(x) \
_mm_broadcastsi128_si256((__m128i const *)&(x))
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index 38c33af..04a06e5 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -830,34 +830,37 @@
// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
// --Require an additional 8 rows for the horiz_w8 transpose tail.
- DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+ DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_SB_SIZE]);
const int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
- assert(w <= 64);
- assert(h <= 64);
+ assert(w <= MAX_SB_SIZE);
+ assert(h <= MAX_SB_SIZE);
assert(y_step_q4 <= 32);
assert(x_step_q4 <= 32);
if (w >= 8) {
scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
- w, intermediate_height);
+ src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
+ x_step_q4, w, intermediate_height);
} else {
scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
- w, intermediate_height);
+ src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
+ x_step_q4, w, intermediate_height);
}
if (w >= 16) {
- scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+ scaledconvolve_vert_w16(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
+ MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
+ y_step_q4, w, h);
} else if (w == 8) {
- scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+ scaledconvolve_vert_w8(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
+ MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
+ y_step_q4, w, h);
} else {
- scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
- dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+ scaledconvolve_vert_w4(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
+ MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
+ y_step_q4, w, h);
}
}
diff --git a/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
index c7f4eeb..91febbc 100644
--- a/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
+++ b/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
@@ -19,13 +19,14 @@
; %define USE_PMULHRSW
; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
; when using this instruction.
+;
+; The add order below (based on ffav1) must be followed to prevent out-of-range
+; intermediate sums.
+; x = k0k1 + k4k5
+; y = k2k3 + k6k7
+; z = signed SAT(x + y)
SECTION .text
-%if ARCH_X86_64
- %define LOCAL_VARS_SIZE 16*4
-%else
- %define LOCAL_VARS_SIZE 16*6
-%endif
+%define LOCAL_VARS_SIZE 16*6
%macro SETUP_LOCAL_VARS 0
; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
@@ -52,11 +53,11 @@
mova k6k7, m3
%if ARCH_X86_64
%define krd m12
- %define tmp m13
+ %define tmp0 [rsp + 16*4]
+ %define tmp1 [rsp + 16*5]
mova krd, [GLOBAL(pw_64)]
%else
- %define tmp [rsp + 16*4]
- %define krd [rsp + 16*5]
+ %define krd [rsp + 16*4]
%if CONFIG_PIC=0
mova m6, [GLOBAL(pw_64)]
%else
@@ -69,55 +70,31 @@
%endif
%endm
-%macro HORIZx4_ROW 2
- mova %2, %1
- punpcklbw %1, %1
- punpckhbw %2, %2
-
- mova m3, %2
- palignr %2, %1, 1
- palignr m3, %1, 5
-
- pmaddubsw %2, k0k1k4k5
- pmaddubsw m3, k2k3k6k7
-
- mova m4, %2
- mova m5, m3
- psrldq %2, 8
- psrldq m3, 8
- mova m6, m5
-
- paddsw m4, m3
- pmaxsw m5, %2
- pminsw %2, m6
- paddsw %2, m4
- paddsw %2, m5
- paddsw %2, krd
- psraw %2, 7
- packuswb %2, %2
-%endm
-
;-------------------------------------------------------------------------------
+%if ARCH_X86_64
+ %define LOCAL_VARS_SIZE_H4 0
+%else
+ %define LOCAL_VARS_SIZE_H4 16*4
+%endif
+
%macro SUBPIX_HFILTER4 1
-cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
+cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
src, sstride, dst, dstride, height, filter
mova m4, [filterq]
packsswb m4, m4
%if ARCH_X86_64
- %define k0k1k4k5 m8
- %define k2k3k6k7 m9
- %define krd m10
- %define orig_height r7d
+ %define k0k1k4k5 m8
+ %define k2k3k6k7 m9
+ %define krd m10
mova krd, [GLOBAL(pw_64)]
pshuflw k0k1k4k5, m4, 0b ;k0_k1
pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
pshuflw k2k3k6k7, m4, 01010101b ;k2_k3
pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
%else
- %define k0k1k4k5 [rsp + 16*0]
- %define k2k3k6k7 [rsp + 16*1]
- %define krd [rsp + 16*2]
- %define orig_height [rsp + 16*3]
+ %define k0k1k4k5 [rsp + 16*0]
+ %define k2k3k6k7 [rsp + 16*1]
+ %define krd [rsp + 16*2]
pshuflw m6, m4, 0b ;k0_k1
pshufhw m6, m6, 10101010b ;k0_k1_k4_k5
pshuflw m7, m4, 01010101b ;k2_k3
@@ -134,68 +111,46 @@
mova k2k3k6k7, m7
mova krd, m1
%endif
- mov orig_height, heightd
- shr heightd, 1
+ dec heightd
+
.loop:
;Do two rows at once
- movh m0, [srcq - 3]
- movh m1, [srcq + 5]
- punpcklqdq m0, m1
- mova m1, m0
- movh m2, [srcq + sstrideq - 3]
- movh m3, [srcq + sstrideq + 5]
- punpcklqdq m2, m3
- mova m3, m2
- punpcklbw m0, m0
- punpckhbw m1, m1
- punpcklbw m2, m2
- punpckhbw m3, m3
- mova m4, m1
- palignr m4, m0, 1
- pmaddubsw m4, k0k1k4k5
- palignr m1, m0, 5
+ movu m4, [srcq - 3]
+ movu m5, [srcq + sstrideq - 3]
+ punpckhbw m1, m4, m4
+ punpcklbw m4, m4
+ punpckhbw m3, m5, m5
+ punpcklbw m5, m5
+ palignr m0, m1, m4, 1
+ pmaddubsw m0, k0k1k4k5
+ palignr m1, m4, 5
pmaddubsw m1, k2k3k6k7
- mova m7, m3
- palignr m7, m2, 1
- pmaddubsw m7, k0k1k4k5
- palignr m3, m2, 5
+ palignr m2, m3, m5, 1
+ pmaddubsw m2, k0k1k4k5
+ palignr m3, m5, 5
pmaddubsw m3, k2k3k6k7
- mova m0, m4
- mova m5, m1
- mova m2, m7
- psrldq m4, 8
- psrldq m1, 8
- mova m6, m5
+ punpckhqdq m4, m0, m2
+ punpcklqdq m0, m2
+ punpckhqdq m5, m1, m3
+ punpcklqdq m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+%ifidn %1, h8_avg
+ movd m4, [dstq]
+ movd m5, [dstq + dstrideq]
+%endif
paddsw m0, m1
- mova m1, m3
- psrldq m7, 8
- psrldq m3, 8
- paddsw m2, m3
- mova m3, m1
- pmaxsw m5, m4
- pminsw m4, m6
- paddsw m4, m0
- paddsw m4, m5
- pmaxsw m1, m7
- pminsw m7, m3
- paddsw m7, m2
- paddsw m7, m1
-
- paddsw m4, krd
- psraw m4, 7
- packuswb m4, m4
- paddsw m7, krd
- psraw m7, 7
- packuswb m7, m7
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
+ psrldq m1, m0, 4
%ifidn %1, h8_avg
- movd m0, [dstq]
- pavgb m4, m0
- movd m2, [dstq + dstrideq]
- pavgb m7, m2
+ pavgb m0, m4
+ pavgb m1, m5
%endif
- movd [dstq], m4
- movd [dstq + dstrideq], m7
+ movd [dstq], m0
+ movd [dstq + dstrideq], m1
lea srcq, [srcq + sstrideq ]
prefetcht0 [srcq + 4 * sstrideq - 3]
@@ -203,236 +158,175 @@
lea dstq, [dstq + 2 * dstrideq ]
prefetcht0 [srcq + 2 * sstrideq - 3]
- dec heightd
- jnz .loop
+ sub heightd, 2
+ jg .loop
; Do last row if output_height is odd
- mov heightd, orig_height
- and heightd, 1
- je .done
+ jne .done
- movh m0, [srcq - 3] ; load src
- movh m1, [srcq + 5]
- punpcklqdq m0, m1
-
- HORIZx4_ROW m0, m1
+ movu m4, [srcq - 3]
+ punpckhbw m1, m4, m4
+ punpcklbw m4, m4
+ palignr m0, m1, m4, 1
+ palignr m1, m4, 5
+ pmaddubsw m0, k0k1k4k5
+ pmaddubsw m1, k2k3k6k7
+ psrldq m2, m0, 8
+ psrldq m3, m1, 8
+ paddsw m0, m2
+ paddsw m1, m3
+ paddsw m0, m1
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
%ifidn %1, h8_avg
- movd m0, [dstq]
- pavgb m1, m0
+ movd m4, [dstq]
+ pavgb m0, m4
%endif
- movd [dstq], m1
-.done
- RET
-%endm
-
-%macro HORIZx8_ROW 5
- mova %2, %1
- punpcklbw %1, %1
- punpckhbw %2, %2
-
- mova %3, %2
- mova %4, %2
- mova %5, %2
-
- palignr %2, %1, 1
- palignr %3, %1, 5
- palignr %4, %1, 9
- palignr %5, %1, 13
-
- pmaddubsw %2, k0k1
- pmaddubsw %3, k2k3
- pmaddubsw %4, k4k5
- pmaddubsw %5, k6k7
-
- paddsw %2, %5
- mova %1, %3
- pminsw %3, %4
- pmaxsw %1, %4
- paddsw %2, %3
- paddsw %1, %2
- paddsw %1, krd
- psraw %1, 7
- packuswb %1, %1
+ movd [dstq], m0
+.done:
+ REP_RET
%endm
;-------------------------------------------------------------------------------
%macro SUBPIX_HFILTER8 1
-cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 14, LOCAL_VARS_SIZE, \
+cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
src, sstride, dst, dstride, height, filter
mova m4, [filterq]
SETUP_LOCAL_VARS
-%if ARCH_X86_64
- %define orig_height r7d
-%else
- %define orig_height heightmp
-%endif
- mov orig_height, heightd
- shr heightd, 1
+ dec heightd
.loop:
- movh m0, [srcq - 3]
- movh m3, [srcq + 5]
- movh m4, [srcq + sstrideq - 3]
- movh m7, [srcq + sstrideq + 5]
- punpcklqdq m0, m3
- mova m1, m0
+ ;Do two rows at once
+ movu m0, [srcq - 3]
+ movu m4, [srcq + sstrideq - 3]
+ punpckhbw m1, m0, m0
punpcklbw m0, m0
- punpckhbw m1, m1
- mova m5, m1
- palignr m5, m0, 13
+ palignr m5, m1, m0, 13
pmaddubsw m5, k6k7
- mova m2, m1
- mova m3, m1
+ palignr m2, m1, m0, 5
+ palignr m3, m1, m0, 9
palignr m1, m0, 1
pmaddubsw m1, k0k1
- punpcklqdq m4, m7
- mova m6, m4
+ punpckhbw m6, m4, m4
punpcklbw m4, m4
- palignr m2, m0, 5
- punpckhbw m6, m6
- palignr m3, m0, 9
- mova m7, m6
pmaddubsw m2, k2k3
pmaddubsw m3, k4k5
- palignr m7, m4, 13
- paddsw m1, m5
- mova m5, m6
- mova m0, m2
- palignr m5, m4, 5
- pminsw m2, m3
+ palignr m7, m6, m4, 13
+ palignr m0, m6, m4, 5
pmaddubsw m7, k6k7
- pmaxsw m3, m0
- paddsw m1, m2
- mova m0, m6
- palignr m6, m4, 1
- pmaddubsw m5, k2k3
paddsw m1, m3
+ paddsw m2, m5
+ paddsw m1, m2
+%ifidn %1, h8_avg
+ movh m2, [dstq]
+ movhps m2, [dstq + dstrideq]
+%endif
+ palignr m5, m6, m4, 9
+ palignr m6, m4, 1
+ pmaddubsw m0, k2k3
pmaddubsw m6, k0k1
- palignr m0, m4, 9
paddsw m1, krd
- pmaddubsw m0, k4k5
- mova m4, m5
+ pmaddubsw m5, k4k5
psraw m1, 7
- pminsw m5, m0
- paddsw m6, m7
- packuswb m1, m1
-
+ paddsw m0, m7
paddsw m6, m5
- pmaxsw m0, m4
paddsw m6, m0
paddsw m6, krd
psraw m6, 7
- packuswb m6, m6
-
+ packuswb m1, m6
%ifidn %1, h8_avg
- movh m0, [dstq]
- movh m2, [dstq + dstrideq]
- pavgb m1, m0
- pavgb m6, m2
+ pavgb m1, m2
%endif
- movh [dstq], m1
- movh [dstq + dstrideq], m6
+ movh [dstq], m1
+ movhps [dstq + dstrideq], m1
lea srcq, [srcq + sstrideq ]
prefetcht0 [srcq + 4 * sstrideq - 3]
lea srcq, [srcq + sstrideq ]
lea dstq, [dstq + 2 * dstrideq ]
prefetcht0 [srcq + 2 * sstrideq - 3]
- dec heightd
- jnz .loop
+ sub heightd, 2
+ jg .loop
- ;Do last row if output_height is odd
- mov heightd, orig_height
- and heightd, 1
- je .done
+ ; Do last row if output_height is odd
+ jne .done
- movh m0, [srcq - 3]
- movh m3, [srcq + 5]
- punpcklqdq m0, m3
-
- HORIZx8_ROW m0, m1, m2, m3, m4
-
+ movu m0, [srcq - 3]
+ punpckhbw m3, m0, m0
+ punpcklbw m0, m0
+ palignr m1, m3, m0, 1
+ palignr m2, m3, m0, 5
+ palignr m4, m3, m0, 13
+ palignr m3, m0, 9
+ pmaddubsw m1, k0k1
+ pmaddubsw m2, k2k3
+ pmaddubsw m3, k4k5
+ pmaddubsw m4, k6k7
+ paddsw m1, m3
+ paddsw m4, m2
+ paddsw m1, m4
+ paddsw m1, krd
+ psraw m1, 7
+ packuswb m1, m1
%ifidn %1, h8_avg
- movh m1, [dstq]
- pavgb m0, m1
+ movh m0, [dstq]
+ pavgb m1, m0
%endif
- movh [dstq], m0
+ movh [dstq], m1
.done:
- RET
+ REP_RET
%endm
;-------------------------------------------------------------------------------
%macro SUBPIX_HFILTER16 1
-cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 14, LOCAL_VARS_SIZE, \
+cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
src, sstride, dst, dstride, height, filter
mova m4, [filterq]
SETUP_LOCAL_VARS
+
.loop:
prefetcht0 [srcq + 2 * sstrideq -3]
- movh m0, [srcq - 3]
- movh m4, [srcq + 5]
- movh m6, [srcq + 13]
- punpcklqdq m0, m4
- mova m7, m0
- punpckhbw m0, m0
- mova m1, m0
- punpcklqdq m4, m6
- mova m3, m0
- punpcklbw m7, m7
-
- palignr m3, m7, 13
- mova m2, m0
- pmaddubsw m3, k6k7
- palignr m0, m7, 1
+ movu m0, [srcq - 3]
+ movu m4, [srcq - 2]
pmaddubsw m0, k0k1
- palignr m1, m7, 5
- pmaddubsw m1, k2k3
- palignr m2, m7, 9
- pmaddubsw m2, k4k5
- paddsw m0, m3
- mova m3, m4
- punpckhbw m4, m4
- mova m5, m4
- punpcklbw m3, m3
- mova m7, m4
- palignr m5, m3, 5
- mova m6, m4
- palignr m4, m3, 1
pmaddubsw m4, k0k1
+ movu m1, [srcq - 1]
+ movu m5, [srcq + 0]
+ pmaddubsw m1, k2k3
pmaddubsw m5, k2k3
- palignr m6, m3, 9
+ movu m2, [srcq + 1]
+ movu m6, [srcq + 2]
+ pmaddubsw m2, k4k5
pmaddubsw m6, k4k5
- palignr m7, m3, 13
+ movu m3, [srcq + 3]
+ movu m7, [srcq + 4]
+ pmaddubsw m3, k6k7
pmaddubsw m7, k6k7
-
- mova m3, m1
- pmaxsw m1, m2
- pminsw m2, m3
paddsw m0, m2
+ paddsw m1, m3
paddsw m0, m1
- paddsw m4, m7
- mova m7, m5
- pmaxsw m5, m6
- pminsw m6, m7
paddsw m4, m6
+ paddsw m5, m7
paddsw m4, m5
paddsw m0, krd
paddsw m4, krd
psraw m0, 7
psraw m4, 7
- packuswb m0, m4
+ packuswb m0, m0
+ packuswb m4, m4
+ punpcklbw m0, m4
%ifidn %1, h8_avg
- mova m1, [dstq]
- pavgb m0, m1
+ pavgb m0, [dstq]
%endif
lea srcq, [srcq + sstrideq]
mova [dstq], m0
lea dstq, [dstq + dstrideq]
dec heightd
jnz .loop
- RET
+ REP_RET
%endm
INIT_XMM ssse3
@@ -444,223 +338,463 @@
SUBPIX_HFILTER4 h8_avg
;-------------------------------------------------------------------------------
+
+; TODO(Linfeng): Detect cpu type and choose the code with better performance.
+%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
+
+%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+ %define NUM_GENERAL_REG_USED 9
+%else
+ %define NUM_GENERAL_REG_USED 6
+%endif
+
%macro SUBPIX_VFILTER 2
-cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
+cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
src, sstride, dst, dstride, height, filter
mova m4, [filterq]
SETUP_LOCAL_VARS
-%if ARCH_X86_64
- %define src1q r7
- %define sstride6q r8
- %define dst_stride dstrideq
-%else
- %define src1q filterq
- %define sstride6q dstrideq
- %define dst_stride dstridemp
-%endif
- mov src1q, srcq
- add src1q, sstrideq
- lea sstride6q, [sstrideq + sstrideq * 4]
- add sstride6q, sstrideq ;pitch * 6
%ifidn %2, 8
- %define movx movh
+ %define movx movh
%else
- %define movx movd
+ %define movx movd
%endif
+
+ dec heightd
+
+%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
+%if ARCH_X86_64
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
+%else
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
+%endif
+ mov src1q, srcq
+ add src1q, sstrideq
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
+
.loop:
- movx m0, [srcq ] ;A
- movx m1, [srcq + sstrideq ] ;B
- punpcklbw m0, m1 ;A B
- movx m2, [srcq + sstrideq * 2 ] ;C
- pmaddubsw m0, k0k1
- mova m6, m2
- movx m3, [src1q + sstrideq * 2] ;D
- punpcklbw m2, m3 ;C D
- pmaddubsw m2, k2k3
- movx m4, [srcq + sstrideq * 4 ] ;E
- mova m7, m4
- movx m5, [src1q + sstrideq * 4] ;F
- punpcklbw m4, m5 ;E F
- pmaddubsw m4, k4k5
- punpcklbw m1, m6 ;A B next iter
- movx m6, [srcq + sstride6q ] ;G
- punpcklbw m5, m6 ;E F next iter
- punpcklbw m3, m7 ;C D next iter
- pmaddubsw m5, k4k5
- movx m7, [src1q + sstride6q ] ;H
- punpcklbw m6, m7 ;G H
- pmaddubsw m6, k6k7
- mova tmp, m2
- pmaddubsw m3, k2k3
- pmaddubsw m1, k0k1
- pmaxsw m2, m4
- paddsw m0, m6
- movx m6, [srcq + sstrideq * 8 ] ;H next iter
- punpcklbw m7, m6
- pmaddubsw m7, k6k7
- pminsw m4, tmp
- paddsw m0, m4
- mova m4, m3
- paddsw m0, m2
- pminsw m3, m5
- pmaxsw m5, m4
- paddsw m0, krd
- psraw m0, 7
- paddsw m1, m7
- packuswb m0, m0
+ ;Do two rows at once
+ movx m0, [srcq ] ;A
+ movx m1, [src1q ] ;B
+ punpcklbw m0, m1 ;A B
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ pmaddubsw m0, k0k1
+ mova m6, m2
+ movx m3, [src1q + sstrideq * 2] ;D
+ punpcklbw m2, m3 ;C D
+ pmaddubsw m2, k2k3
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ mova m7, m4
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m4, k4k5
+ punpcklbw m1, m6 ;A B next iter
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m5, m6 ;E F next iter
+ punpcklbw m3, m7 ;C D next iter
+ pmaddubsw m5, k4k5
+ movx m7, [src1q + sstride6q ] ;H
+ punpcklbw m6, m7 ;G H
+ pmaddubsw m6, k6k7
+ pmaddubsw m3, k2k3
+ pmaddubsw m1, k0k1
+ paddsw m0, m4
+ paddsw m2, m6
+ movx m6, [srcq + sstrideq * 8 ] ;H next iter
+ punpcklbw m7, m6
+ pmaddubsw m7, k6k7
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ paddsw m1, m5
+ packuswb m0, m0
- paddsw m1, m3
- paddsw m1, m5
- paddsw m1, krd
- psraw m1, 7
- lea srcq, [srcq + sstrideq * 2 ]
- lea src1q, [src1q + sstrideq * 2]
- packuswb m1, m1
+ paddsw m3, m7
+ paddsw m1, m3
+ paddsw m1, krd
+ psraw m1, 7
+ lea srcq, [srcq + sstrideq * 2 ]
+ lea src1q, [src1q + sstrideq * 2]
+ packuswb m1, m1
%ifidn %1, v8_avg
- movx m2, [dstq]
- pavgb m0, m2
+ movx m2, [dstq]
+ pavgb m0, m2
%endif
- movx [dstq], m0
- add dstq, dst_stride
+ movx [dstq], m0
+ add dstq, dst_stride
%ifidn %1, v8_avg
- movx m3, [dstq]
- pavgb m1, m3
+ movx m3, [dstq]
+ pavgb m1, m3
%endif
- movx [dstq], m1
- add dstq, dst_stride
- sub heightd, 2
- cmp heightd, 1
- jg .loop
+ movx [dstq], m1
+ add dstq, dst_stride
+ sub heightd, 2
+ jg .loop
- cmp heightd, 0
- je .done
+ ; Do last row if output_height is odd
+ jne .done
- movx m0, [srcq ] ;A
- movx m1, [srcq + sstrideq ] ;B
- movx m6, [srcq + sstride6q ] ;G
- punpcklbw m0, m1 ;A B
- movx m7, [rax + sstride6q ] ;H
- pmaddubsw m0, k0k1
- movx m2, [srcq + sstrideq * 2 ] ;C
- punpcklbw m6, m7 ;G H
- movx m3, [rax + sstrideq * 2 ] ;D
- pmaddubsw m6, k6k7
- movx m4, [srcq + sstrideq * 4 ] ;E
- punpcklbw m2, m3 ;C D
- movx m5, [src1q + sstrideq * 4] ;F
- punpcklbw m4, m5 ;E F
- pmaddubsw m2, k2k3
- pmaddubsw m4, k4k5
- paddsw m0, m6
- mova m1, m2
- pmaxsw m2, m4
- pminsw m4, m1
- paddsw m0, m4
- paddsw m0, m2
- paddsw m0, krd
- psraw m0, 7
- packuswb m0, m0
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m0, m1 ;A B
+ movx m7, [src1q + sstride6q ] ;H
+ pmaddubsw m0, k0k1
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ punpcklbw m6, m7 ;G H
+ movx m3, [src1q + sstrideq * 2] ;D
+ pmaddubsw m6, k6k7
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ punpcklbw m2, m3 ;C D
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m2, k2k3
+ pmaddubsw m4, k4k5
+ paddsw m2, m6
+ paddsw m0, m4
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
%ifidn %1, v8_avg
- movx m1, [dstq]
- pavgb m0, m1
+ movx m1, [dstq]
+ pavgb m0, m1
%endif
- movx [dstq], m0
+ movx [dstq], m0
+
+%else
+ ; ARCH_X86_64
+
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m2, [srcq] ;C
+ movx m3, [srcq + sstrideq] ;D
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m4, [srcq] ;E
+ movx m5, [srcq + sstrideq] ;F
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m6, [srcq] ;G
+ punpcklbw m0, m1 ;A B
+ punpcklbw m1, m2 ;A B next iter
+ punpcklbw m2, m3 ;C D
+ punpcklbw m3, m4 ;C D next iter
+ punpcklbw m4, m5 ;E F
+ punpcklbw m5, m6 ;E F next iter
+
+.loop:
+ ;Do two rows at once
+ movx m7, [srcq + sstrideq] ;H
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m14, [srcq] ;H next iter
+ punpcklbw m6, m7 ;G H
+ punpcklbw m7, m14 ;G H next iter
+ pmaddubsw m8, m0, k0k1
+ pmaddubsw m9, m1, k0k1
+ mova m0, m2
+ mova m1, m3
+ pmaddubsw m10, m2, k2k3
+ pmaddubsw m11, m3, k2k3
+ mova m2, m4
+ mova m3, m5
+ pmaddubsw m4, k4k5
+ pmaddubsw m5, k4k5
+ paddsw m8, m4
+ paddsw m9, m5
+ mova m4, m6
+ mova m5, m7
+ pmaddubsw m6, k6k7
+ pmaddubsw m7, k6k7
+ paddsw m10, m6
+ paddsw m11, m7
+ paddsw m8, m10
+ paddsw m9, m11
+ mova m6, m14
+ paddsw m8, krd
+ paddsw m9, krd
+ psraw m8, 7
+ psraw m9, 7
+%ifidn %2, 4
+ packuswb m8, m8
+ packuswb m9, m9
+%else
+ packuswb m8, m9
+%endif
+
+%ifidn %1, v8_avg
+ movx m7, [dstq]
+%ifidn %2, 4
+ movx m10, [dstq + dstrideq]
+ pavgb m9, m10
+%else
+ movhpd m7, [dstq + dstrideq]
+%endif
+ pavgb m8, m7
+%endif
+ movx [dstq], m8
+%ifidn %2, 4
+ movx [dstq + dstrideq], m9
+%else
+ movhpd [dstq + dstrideq], m8
+%endif
+
+ lea dstq, [dstq + dstrideq * 2 ]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movx m7, [srcq + sstrideq] ;H
+ punpcklbw m6, m7 ;G H
+ pmaddubsw m0, k0k1
+ pmaddubsw m2, k2k3
+ pmaddubsw m4, k4k5
+ pmaddubsw m6, k6k7
+ paddsw m0, m4
+ paddsw m2, m6
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
+%ifidn %1, v8_avg
+ movx m1, [dstq]
+ pavgb m0, m1
+%endif
+ movx [dstq], m0
+
+%endif ; ARCH_X86_64
+
.done:
- RET
+ REP_RET
+
%endm
;-------------------------------------------------------------------------------
%macro SUBPIX_VFILTER16 1
-cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
+cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
src, sstride, dst, dstride, height, filter
-
- mova m4, [filterq]
+ mova m4, [filterq]
SETUP_LOCAL_VARS
+
+%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
%if ARCH_X86_64
- %define src1q r7
- %define sstride6q r8
- %define dst_stride dstrideq
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
%else
- %define src1q filterq
- %define sstride6q dstrideq
- %define dst_stride dstridemp
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
%endif
- mov src1q, srcq
- add src1q, sstrideq
- lea sstride6q, [sstrideq + sstrideq * 4]
- add sstride6q, sstrideq ;pitch * 6
+ lea src1q, [srcq + sstrideq]
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
.loop:
- movh m0, [srcq ] ;A
- movh m1, [srcq + sstrideq ] ;B
- movh m2, [srcq + sstrideq * 2 ] ;C
- movh m3, [src1q + sstrideq * 2] ;D
- movh m4, [srcq + sstrideq * 4 ] ;E
- movh m5, [src1q + sstrideq * 4] ;F
+ movh m0, [srcq ] ;A
+ movh m1, [src1q ] ;B
+ movh m2, [srcq + sstrideq * 2 ] ;C
+ movh m3, [src1q + sstrideq * 2] ;D
+ movh m4, [srcq + sstrideq * 4 ] ;E
+ movh m5, [src1q + sstrideq * 4] ;F
- punpcklbw m0, m1 ;A B
- movh m6, [srcq + sstride6q] ;G
- punpcklbw m2, m3 ;C D
- movh m7, [src1q + sstride6q] ;H
- punpcklbw m4, m5 ;E F
- pmaddubsw m0, k0k1
- movh m3, [srcq + 8] ;A
- pmaddubsw m2, k2k3
- punpcklbw m6, m7 ;G H
- movh m5, [srcq + sstrideq + 8] ;B
- pmaddubsw m4, k4k5
- punpcklbw m3, m5 ;A B
- movh m7, [srcq + sstrideq * 2 + 8] ;C
- pmaddubsw m6, k6k7
- mova m1, m2
- movh m5, [src1q + sstrideq * 2 + 8] ;D
- pmaxsw m2, m4
- punpcklbw m7, m5 ;C D
- pminsw m4, m1
- paddsw m0, m6
- pmaddubsw m3, k0k1
- movh m1, [srcq + sstrideq * 4 + 8] ;E
- paddsw m0, m4
- pmaddubsw m7, k2k3
- movh m6, [src1q + sstrideq * 4 + 8] ;F
- punpcklbw m1, m6 ;E F
- paddsw m0, m2
- paddsw m0, krd
- movh m2, [srcq + sstride6q + 8] ;G
- pmaddubsw m1, k4k5
- movh m5, [src1q + sstride6q + 8] ;H
- psraw m0, 7
- punpcklbw m2, m5 ;G H
- packuswb m0, m0
- pmaddubsw m2, k6k7
-%ifidn %1, v8_avg
- movh m4, [dstq]
- pavgb m0, m4
-%endif
- movh [dstq], m0
- mova m6, m7
- pmaxsw m7, m1
- pminsw m1, m6
- paddsw m3, m2
- paddsw m3, m1
- paddsw m3, m7
- paddsw m3, krd
- psraw m3, 7
- packuswb m3, m3
+ punpcklbw m0, m1 ;A B
+ movh m6, [srcq + sstride6q] ;G
+ punpcklbw m2, m3 ;C D
+ movh m7, [src1q + sstride6q] ;H
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m0, k0k1
+ movh m3, [srcq + 8] ;A
+ pmaddubsw m2, k2k3
+ punpcklbw m6, m7 ;G H
+ movh m5, [srcq + sstrideq + 8] ;B
+ pmaddubsw m4, k4k5
+ punpcklbw m3, m5 ;A B
+ movh m7, [srcq + sstrideq * 2 + 8] ;C
+ pmaddubsw m6, k6k7
+ movh m5, [src1q + sstrideq * 2 + 8] ;D
+ punpcklbw m7, m5 ;C D
+ paddsw m2, m6
+ pmaddubsw m3, k0k1
+ movh m1, [srcq + sstrideq * 4 + 8] ;E
+ paddsw m0, m4
+ pmaddubsw m7, k2k3
+ movh m6, [src1q + sstrideq * 4 + 8] ;F
+ punpcklbw m1, m6 ;E F
+ paddsw m0, m2
+ paddsw m0, krd
+ movh m2, [srcq + sstride6q + 8] ;G
+ pmaddubsw m1, k4k5
+ movh m5, [src1q + sstride6q + 8] ;H
+ psraw m0, 7
+ punpcklbw m2, m5 ;G H
+ pmaddubsw m2, k6k7
+ paddsw m7, m2
+ paddsw m3, m1
+ paddsw m3, m7
+ paddsw m3, krd
+ psraw m3, 7
+ packuswb m0, m3
- add srcq, sstrideq
- add src1q, sstrideq
+ add srcq, sstrideq
+ add src1q, sstrideq
%ifidn %1, v8_avg
- movh m1, [dstq + 8]
- pavgb m3, m1
+ pavgb m0, [dstq]
%endif
- movh [dstq + 8], m3
- add dstq, dst_stride
- dec heightd
- jnz .loop
- RET
+ mova [dstq], m0
+ add dstq, dst_stride
+ dec heightd
+ jnz .loop
+ REP_RET
+
+%else
+ ; ARCH_X86_64
+ dec heightd
+
+ movu m1, [srcq ] ;A
+ movu m3, [srcq + sstrideq ] ;B
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m0, m1, m3 ;A B
+ punpckhbw m1, m3 ;A B
+ movu m5, [srcq] ;C
+ punpcklbw m2, m3, m5 ;A B next iter
+ punpckhbw m3, m5 ;A B next iter
+ mova tmp0, m2 ;store to stack
+ mova tmp1, m3 ;store to stack
+ movu m7, [srcq + sstrideq] ;D
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m4, m5, m7 ;C D
+ punpckhbw m5, m7 ;C D
+ movu m9, [srcq] ;E
+ punpcklbw m6, m7, m9 ;C D next iter
+ punpckhbw m7, m9 ;C D next iter
+ movu m11, [srcq + sstrideq] ;F
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m8, m9, m11 ;E F
+ punpckhbw m9, m11 ;E F
+ movu m2, [srcq] ;G
+ punpcklbw m10, m11, m2 ;E F next iter
+ punpckhbw m11, m2 ;E F next iter
+
+.loop:
+ ;Do two rows at once
+ pmaddubsw m13, m0, k0k1
+ mova m0, m4
+ pmaddubsw m14, m8, k4k5
+ pmaddubsw m15, m4, k2k3
+ mova m4, m8
+ paddsw m13, m14
+ movu m3, [srcq + sstrideq] ;H
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m14, m2, m3 ;G H
+ mova m8, m14
+ pmaddubsw m14, k6k7
+ paddsw m15, m14
+ paddsw m13, m15
+ paddsw m13, krd
+ psraw m13, 7
+
+ pmaddubsw m14, m1, k0k1
+ pmaddubsw m1, m9, k4k5
+ pmaddubsw m15, m5, k2k3
+ paddsw m14, m1
+ mova m1, m5
+ mova m5, m9
+ punpckhbw m2, m3 ;G H
+ mova m9, m2
+ pmaddubsw m2, k6k7
+ paddsw m15, m2
+ paddsw m14, m15
+ paddsw m14, krd
+ psraw m14, 7
+ packuswb m13, m14
+%ifidn %1, v8_avg
+ pavgb m13, [dstq]
+%endif
+ mova [dstq], m13
+
+ ; next iter
+ pmaddubsw m15, tmp0, k0k1
+ pmaddubsw m14, m10, k4k5
+ pmaddubsw m13, m6, k2k3
+ paddsw m15, m14
+ mova tmp0, m6
+ mova m6, m10
+ movu m2, [srcq] ;G next iter
+ punpcklbw m14, m3, m2 ;G H next iter
+ mova m10, m14
+ pmaddubsw m14, k6k7
+ paddsw m13, m14
+ paddsw m15, m13
+ paddsw m15, krd
+ psraw m15, 7
+
+ pmaddubsw m14, tmp1, k0k1
+ mova tmp1, m7
+ pmaddubsw m13, m7, k2k3
+ mova m7, m11
+ pmaddubsw m11, k4k5
+ paddsw m14, m11
+ punpckhbw m3, m2 ;G H next iter
+ mova m11, m3
+ pmaddubsw m3, k6k7
+ paddsw m13, m3
+ paddsw m14, m13
+ paddsw m14, krd
+ psraw m14, 7
+ packuswb m15, m14
+%ifidn %1, v8_avg
+ pavgb m15, [dstq + dstrideq]
+%endif
+ mova [dstq + dstrideq], m15
+ lea dstq, [dstq + dstrideq * 2]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m3, [srcq + sstrideq] ;H
+ punpcklbw m6, m2, m3 ;G H
+ punpckhbw m2, m3 ;G H
+ pmaddubsw m0, k0k1
+ pmaddubsw m1, k0k1
+ pmaddubsw m4, k2k3
+ pmaddubsw m5, k2k3
+ pmaddubsw m8, k4k5
+ pmaddubsw m9, k4k5
+ pmaddubsw m6, k6k7
+ pmaddubsw m2, k6k7
+ paddsw m0, m8
+ paddsw m1, m9
+ paddsw m4, m6
+ paddsw m5, m2
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m0, krd
+ paddsw m1, krd
+ psraw m0, 7
+ psraw m1, 7
+ packuswb m0, m1
+%ifidn %1, v8_avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+
+.done:
+ REP_RET
+
+%endif ; ARCH_X86_64
+
%endm
INIT_XMM ssse3
diff --git a/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
index d9dc97d..b9b2da0 100644
--- a/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
+++ b/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
@@ -17,14 +17,14 @@
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
+ mov ecx, 0x01000100
movdqa xmm3, [rdx] ;load filters
psrldq xmm3, 6
packsswb xmm3, xmm3
pshuflw xmm3, xmm3, 0b ;k3_k4
- movq xmm2, rcx ;rounding
+ movd xmm2, ecx ;rounding_shift
pshufd xmm2, xmm2, 0
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -36,8 +36,7 @@
punpcklbw xmm0, xmm1
pmaddubsw xmm0, xmm3
- paddsw xmm0, xmm2 ;rounding
- psraw xmm0, 7 ;shift
+ pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7)
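+        ;(0x0100 per word: pmulhrsw computes (x*256 + 0x4000) >> 15 = (x + 64) >> 7,
+        ; folding the old paddsw/psraw pair into one instruction)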
packuswb xmm0, xmm0 ;pack to byte
%if %1
@@ -54,7 +53,7 @@
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
+ mov ecx, 0x01000100
movdqa xmm7, [rdx] ;load filters
psrldq xmm7, 6
@@ -62,7 +61,7 @@
pshuflw xmm7, xmm7, 0b ;k3_k4
punpcklwd xmm7, xmm7
- movq xmm6, rcx ;rounding
+ movd xmm6, ecx ;rounding_shift
pshufd xmm6, xmm6, 0
movsxd rax, DWORD PTR arg(1) ;pixels_per_line
@@ -74,8 +73,7 @@
punpcklbw xmm0, xmm1
pmaddubsw xmm0, xmm7
- paddsw xmm0, xmm6 ;rounding
- psraw xmm0, 7 ;shift
+ pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
packuswb xmm0, xmm0 ;pack back to byte
%if %1
@@ -95,10 +93,8 @@
pmaddubsw xmm0, xmm7
pmaddubsw xmm2, xmm7
- paddsw xmm0, xmm6 ;rounding
- paddsw xmm2, xmm6
- psraw xmm0, 7 ;shift
- psraw xmm2, 7
+ pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
+ pmulhrsw xmm2, xmm6
packuswb xmm0, xmm2 ;pack back to byte
%if %1
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index 94a579c..f25af4c 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -11,6 +11,8 @@
#include <emmintrin.h>
+#include "aom_dsp/x86/synonyms.h"
+
#include "./aom_dsp_rtcd.h"
#include "aom_ports/mem.h"
@@ -122,13 +124,14 @@
unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) {
__m128i s0, s1, u0;
unsigned int avg = 0;
+
u0 = _mm_setzero_si128();
- s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ s0 = _mm_unpacklo_epi8(xx_loadl_32(s), u0);
+ s1 = _mm_unpacklo_epi8(xx_loadl_32(s + p), u0);
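+  // xx_loadl_32 (from synonyms.h) loads just the 4 bytes of each 4-pixel row,
+  // instead of the 8 bytes the old _mm_loadl_epi64 calls fetched.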
s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ s1 = _mm_unpacklo_epi8(xx_loadl_32(s + 2 * p), u0);
s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ s1 = _mm_unpacklo_epi8(xx_loadl_32(s + 3 * p), u0);
s0 = _mm_adds_epu16(s0, s1);
s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h
index 5bc8a52..87ff34b 100644
--- a/aom_dsp/x86/convolve.h
+++ b/aom_dsp/x86/convolve.h
@@ -16,6 +16,7 @@
#include "./aom_config.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
+#include "aom_dsp/aom_convolve.h"
typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
uint8_t *output_ptr, ptrdiff_t out_pitch,
@@ -32,7 +33,7 @@
(void)y_step_q4; \
assert(filter[3] != 128); \
assert(step_q4 == 16); \
- if (filter[0] || filter[1] || filter[2]) { \
+ if (filter[0] | filter[1] | filter[2]) { \
while (w >= 16) { \
aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter); \
@@ -40,19 +41,12 @@
dst += 16; \
w -= 16; \
} \
- while (w >= 8) { \
+ if (w == 8) { \
aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
+ } else if (w == 4) { \
aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
dst_stride, h, filter); \
- src += 4; \
- dst += 4; \
- w -= 4; \
} \
} else { \
while (w >= 16) { \
@@ -62,50 +56,45 @@
dst += 16; \
w -= 16; \
} \
- while (w >= 8) { \
+ if (w == 8) { \
aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \
dst_stride, h, filter); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
+ } else if (w == 4) { \
aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \
dst_stride, h, filter); \
- src += 4; \
- dst += 4; \
- w -= 4; \
} \
} \
}
-#define FUN_CONV_2D(avg, opt) \
- void aom_convolve8_##avg##opt( \
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
- ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, int w, int h) { \
- assert(filter_x[3] != 128); \
- assert(filter_y[3] != 128); \
- assert(w <= 64); \
- assert(h <= 64); \
- assert(x_step_q4 == 16); \
- assert(y_step_q4 == 16); \
- if (filter_x[0] || filter_x[1] || filter_x[2] || filter_y[0] || \
- filter_y[1] || filter_y[2]) { \
- DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
- aom_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
- filter_x, x_step_q4, filter_y, y_step_q4, w, \
- h + 7); \
- aom_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, \
- y_step_q4, w, h); \
- } else { \
- DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
- aom_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter_x, \
- x_step_q4, filter_y, y_step_q4, w, h + 1); \
- aom_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter_x, \
- x_step_q4, filter_y, y_step_q4, w, h); \
- } \
+#define FUN_CONV_2D(avg, opt) \
+ void aom_convolve8_##avg##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h) { \
+ assert(filter_x[3] != 128); \
+ assert(filter_y[3] != 128); \
+ assert(w <= MAX_SB_SIZE); \
+ assert(h <= MAX_SB_SIZE); \
+ assert(x_step_q4 == 16); \
+ assert(y_step_q4 == 16); \
+ if (filter_x[0] || filter_x[1] || filter_x[2] || filter_y[0] || \
+ filter_y[1] || filter_y[2]) { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \
+ aom_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, \
+ MAX_SB_SIZE, filter_x, x_step_q4, filter_y, \
+ y_step_q4, w, h + 7); \
+ aom_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \
+ dst, dst_stride, filter_x, x_step_q4, \
+ filter_y, y_step_q4, w, h); \
+ } else { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \
+ aom_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_SB_SIZE, \
+ filter_x, x_step_q4, filter_y, y_step_q4, w, \
+ h + 1); \
+ aom_convolve8_##avg##vert_##opt(fdata2, MAX_SB_SIZE, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, \
+ y_step_q4, w, h); \
+ } \
}
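+// The 2-D wrapper above first filters horizontally into an intermediate buffer
+// of MAX_SB_SIZE x (MAX_SB_SIZE + 7): h + 7 rows are produced because the 8-tap
+// vertical pass needs 3 rows above and 4 rows below each output row, which is
+// also why the vertical pass starts at fdata2 + 3 * MAX_SB_SIZE.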
#if CONFIG_AOM_HIGHBITDEPTH
@@ -125,7 +114,7 @@
if (step_q4 == 16 && filter[3] != 128) { \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- if (filter[0] || filter[1] || filter[2]) { \
+ if (filter[0] | filter[1] | filter[2]) { \
while (w >= 16) { \
aom_highbd_filter_block1d16_##dir##8_##avg##opt( \
src_start, src_stride, dst, dst_stride, h, filter, bd); \
@@ -183,26 +172,29 @@
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
- assert(w <= 64); \
- assert(h <= 64); \
+ assert(w <= MAX_SB_SIZE); \
+ assert(h <= MAX_SB_SIZE); \
if (x_step_q4 == 16 && y_step_q4 == 16) { \
if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
- aom_highbd_convolve8_horiz_##opt( \
- src - 3 * src_stride, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, \
- filter_x, x_step_q4, filter_y, y_step_q4, w, h + 7, bd); \
+ DECLARE_ALIGNED(16, uint16_t, \
+ fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \
+ aom_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+ CONVERT_TO_BYTEPTR(fdata2), \
+ MAX_SB_SIZE, filter_x, x_step_q4, \
+ filter_y, y_step_q4, w, h + 7, bd); \
aom_highbd_convolve8_##avg##vert_##opt( \
- CONVERT_TO_BYTEPTR(fdata2) + 192, 64, dst, dst_stride, filter_x, \
- x_step_q4, filter_y, y_step_q4, w, h, bd); \
+ CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_SB_SIZE, MAX_SB_SIZE, dst, \
+ dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \
} else { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
+ DECLARE_ALIGNED(16, uint16_t, \
+ fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \
aom_highbd_convolve8_horiz_##opt( \
- src, src_stride, CONVERT_TO_BYTEPTR(fdata2), 64, filter_x, \
- x_step_q4, filter_y, y_step_q4, w, h + 1, bd); \
+ src, src_stride, CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, \
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h + 1, bd); \
aom_highbd_convolve8_##avg##vert_##opt( \
- CONVERT_TO_BYTEPTR(fdata2), 64, dst, dst_stride, filter_x, \
- x_step_q4, filter_y, y_step_q4, w, h, bd); \
+ CONVERT_TO_BYTEPTR(fdata2), MAX_SB_SIZE, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \
} \
} else { \
aom_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
diff --git a/aom_dsp/x86/deblock_sse2.asm b/aom_dsp/x86/deblock_sse2.asm
new file mode 100644
index 0000000..bae6cf4
--- /dev/null
+++ b/aom_dsp/x86/deblock_sse2.asm
@@ -0,0 +1,661 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+;macros used in the deblock functions below
+%macro FIRST_2_ROWS 0
+ movdqa xmm4, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm5, xmm1
+ pavgb xmm5, xmm3
+
+ ;calculate absolute value
+ psubusb xmm4, xmm1
+ psubusb xmm1, xmm0
+ psubusb xmm6, xmm3
+ psubusb xmm3, xmm0
+ paddusb xmm4, xmm1
+ paddusb xmm6, xmm3
+
+ ;get threshold
+ movdqa xmm2, flimit
+ pxor xmm1, xmm1
+ movdqa xmm7, xmm2
+
+ ;get mask
+ psubusb xmm2, xmm4
+ psubusb xmm7, xmm6
+ pcmpeqb xmm2, xmm1
+ pcmpeqb xmm7, xmm1
+ por xmm7, xmm2
+%endmacro
+
+%macro SECOND_2_ROWS 0
+ movdqa xmm6, xmm0
+ movdqa xmm4, xmm0
+ movdqa xmm2, xmm1
+ pavgb xmm1, xmm3
+
+ ;calculate absolute value
+ psubusb xmm6, xmm2
+ psubusb xmm2, xmm0
+ psubusb xmm4, xmm3
+ psubusb xmm3, xmm0
+ paddusb xmm6, xmm2
+ paddusb xmm4, xmm3
+
+ pavgb xmm5, xmm1
+
+ ;get threshold
+ movdqa xmm2, flimit
+ pxor xmm1, xmm1
+ movdqa xmm3, xmm2
+
+ ;get mask
+ psubusb xmm2, xmm6
+ psubusb xmm3, xmm4
+ pcmpeqb xmm2, xmm1
+ pcmpeqb xmm3, xmm1
+
+ por xmm7, xmm2
+ por xmm7, xmm3
+
+ pavgb xmm5, xmm0
+
+  ;decide whether or not to use the filtered value
+ pand xmm0, xmm7
+ pandn xmm7, xmm5
+ paddusb xmm0, xmm7
+%endmacro
+
+%macro UPDATE_FLIMIT 0
+ movdqa xmm2, XMMWORD PTR [rbx]
+ movdqa [rsp], xmm2
+ add rbx, 16
+%endmacro
+
+;void aom_post_proc_down_and_across_mb_row_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned char *dst_ptr,
+; int src_pixels_per_line,
+; int dst_pixels_per_line,
+; int cols,
+; int *flimits,
+; int size
+;)
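+;
+; Roughly: each pixel is filtered vertically against its +/-1 and +/-2 row
+; neighbours ("down"), then horizontally in place against its +/-1 and +/-2
+; column neighbours ("across"); the pavgb average of the pixel and its four
+; neighbours replaces the pixel only where every absolute neighbour difference
+; stays below the flimit threshold loaded for that column group.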
+global sym(aom_post_proc_down_and_across_mb_row_sse2) PRIVATE
+sym(aom_post_proc_down_and_across_mb_row_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+ ALIGN_STACK 16, rax
+ sub rsp, 16
+
+ ; put flimit on stack
+ mov rbx, arg(5) ;flimits ptr
+ UPDATE_FLIMIT
+
+%define flimit [rsp]
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(1) ;dst_ptr
+
+ movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line
+ movsxd rcx, DWORD PTR arg(6) ;rows in a macroblock
+.nextrow:
+ xor rdx, rdx ;col
+.nextcol:
+ ;load current and next 2 rows
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + rax]
+ movdqu xmm3, XMMWORD PTR [rsi + 2*rax]
+
+ FIRST_2_ROWS
+
+ ;load above 2 rows
+ neg rax
+ movdqu xmm1, XMMWORD PTR [rsi + 2*rax]
+ movdqu xmm3, XMMWORD PTR [rsi + rax]
+
+ SECOND_2_ROWS
+
+ movdqu XMMWORD PTR [rdi], xmm0
+
+ neg rax ; positive stride
+ add rsi, 16
+ add rdi, 16
+
+ add rdx, 16
+ cmp edx, dword arg(4) ;cols
+ jge .downdone
+ UPDATE_FLIMIT
+ jmp .nextcol
+
+.downdone:
+        ; done with all the cols, start the across filtering in place
+ sub rsi, rdx
+ sub rdi, rdx
+
+ mov rbx, arg(5) ; flimits
+ UPDATE_FLIMIT
+
+ ; dup the first byte into the left border 8 times
+ movq mm1, [rdi]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ mov rdx, -8
+ movq [rdi+rdx], mm1
+
+ ; dup the last byte into the right border
+ movsxd rdx, dword arg(4)
+ movq mm1, [rdi + rdx + -1]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ movq [rdi+rdx], mm1
+
+ xor rdx, rdx
+ movq mm0, QWORD PTR [rdi-16];
+ movq mm1, QWORD PTR [rdi-8];
+
+.acrossnextcol:
+ movdqu xmm0, XMMWORD PTR [rdi + rdx]
+ movdqu xmm1, XMMWORD PTR [rdi + rdx -2]
+ movdqu xmm3, XMMWORD PTR [rdi + rdx -1]
+
+ FIRST_2_ROWS
+
+ movdqu xmm1, XMMWORD PTR [rdi + rdx +1]
+ movdqu xmm3, XMMWORD PTR [rdi + rdx +2]
+
+ SECOND_2_ROWS
+
+ movq QWORD PTR [rdi+rdx-16], mm0 ; store previous 8 bytes
+ movq QWORD PTR [rdi+rdx-8], mm1 ; store previous 8 bytes
+ movdq2q mm0, xmm0
+ psrldq xmm0, 8
+ movdq2q mm1, xmm0
+
+ add rdx, 16
+ cmp edx, dword arg(4) ;cols
+ jge .acrossdone
+ UPDATE_FLIMIT
+ jmp .acrossnextcol
+
+.acrossdone:
+ ; last 16 pixels
+ movq QWORD PTR [rdi+rdx-16], mm0
+
+ cmp edx, dword arg(4)
+ jne .throw_last_8
+ movq QWORD PTR [rdi+rdx-8], mm1
+.throw_last_8:
+        ; done with this row
+ add rsi,rax ;next src line
+ mov eax, dword arg(3) ;dst_pixels_per_line
+ add rdi,rax ;next destination
+ mov eax, dword arg(2) ;src_pixels_per_line
+
+ mov rbx, arg(5) ;flimits
+ UPDATE_FLIMIT
+
+ dec rcx ;decrement count
+ jnz .nextrow ;next row
+
+ add rsp, 16
+ pop rsp
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit
+
+;void aom_mbpost_proc_down_xmm(unsigned char *dst,
+; int pitch, int rows, int cols,int flimit)
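+;
+; Roughly: a vertical, variance-gated smoother. For each 8-wide column group it
+; keeps a sliding sum and sum of squares over the rows around the current pixel
+; and replaces the pixel with a dithered window average
+; ((sum + pixel + an aom_rv[] entry) >> 4) only where 15*sumsq - sum*sum stays
+; below flimit.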
+extern sym(aom_rv)
+global sym(aom_mbpost_proc_down_xmm) PRIVATE
+sym(aom_mbpost_proc_down_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 128+16
+
+ ; unsigned char d[16][8] at [rsp]
+ ; create flimit2 at [rsp+128]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp+128], eax
+ mov [rsp+128+4], eax
+ mov [rsp+128+8], eax
+ mov [rsp+128+12], eax
+%define flimit4 [rsp+128]
+
+%if ABI_IS_32BIT=0
+ lea r8, [GLOBAL(sym(aom_rv))]
+%endif
+
+ ;rows +=8;
+ add dword arg(2), 8
+
+ ;for(c=0; c<cols; c+=8)
+.loop_col:
+ mov rsi, arg(0) ; s
+ pxor xmm0, xmm0 ;
+
+ movsxd rax, dword ptr arg(1) ;pitch ;
+
+ ; this copies the last row down into the border 8 rows
+ mov rdi, rsi
+ mov rdx, arg(2)
+ sub rdx, 9
+ imul rdx, rax
+ lea rdi, [rdi+rdx]
+        movq        xmm1,       QWORD ptr[rdi]              ; last row
+ mov rcx, 8
+.init_borderd: ; initialize borders
+ lea rdi, [rdi + rax]
+ movq [rdi], xmm1
+
+ dec rcx
+ jne .init_borderd
+
+ neg rax ; rax = -pitch
+
+ ; this copies the first row up into the border 8 rows
+ mov rdi, rsi
+ movq xmm1, QWORD ptr[rdi] ; first row
+ mov rcx, 8
+.init_border: ; initialize borders
+ lea rdi, [rdi + rax]
+ movq [rdi], xmm1
+
+ dec rcx
+ jne .init_border
+
+
+
+ lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8]
+ neg rax
+
+ pxor xmm5, xmm5
+ pxor xmm6, xmm6 ;
+
+ pxor xmm7, xmm7 ;
+ mov rdi, rsi
+
+ mov rcx, 15 ;
+
+.loop_initvar:
+ movq xmm1, QWORD PTR [rdi];
+ punpcklbw xmm1, xmm0 ;
+
+ paddw xmm5, xmm1 ;
+ pmullw xmm1, xmm1 ;
+
+ movdqa xmm2, xmm1 ;
+ punpcklwd xmm1, xmm0 ;
+
+ punpckhwd xmm2, xmm0 ;
+ paddd xmm6, xmm1 ;
+
+ paddd xmm7, xmm2 ;
+ lea rdi, [rdi+rax] ;
+
+ dec rcx
+ jne .loop_initvar
+ ;save the var and sum
+ xor rdx, rdx
+.loop_row:
+ movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
+ movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
+
+ punpcklbw xmm1, xmm0
+ punpcklbw xmm2, xmm0
+
+ paddw xmm5, xmm2
+ psubw xmm5, xmm1
+
+ pmullw xmm2, xmm2
+ movdqa xmm4, xmm2
+
+ punpcklwd xmm2, xmm0
+ punpckhwd xmm4, xmm0
+
+ paddd xmm6, xmm2
+ paddd xmm7, xmm4
+
+ pmullw xmm1, xmm1
+ movdqa xmm2, xmm1
+
+ punpcklwd xmm1, xmm0
+ psubd xmm6, xmm1
+
+ punpckhwd xmm2, xmm0
+ psubd xmm7, xmm2
+
+
+ movdqa xmm3, xmm6
+ pslld xmm3, 4
+
+ psubd xmm3, xmm6
+ movdqa xmm1, xmm5
+
+ movdqa xmm4, xmm5
+ pmullw xmm1, xmm1
+
+ pmulhw xmm4, xmm4
+ movdqa xmm2, xmm1
+
+ punpcklwd xmm1, xmm4
+ punpckhwd xmm2, xmm4
+
+ movdqa xmm4, xmm7
+ pslld xmm4, 4
+
+ psubd xmm4, xmm7
+
+ psubd xmm3, xmm1
+ psubd xmm4, xmm2
+
+ psubd xmm3, flimit4
+ psubd xmm4, flimit4
+
+ psrad xmm3, 31
+ psrad xmm4, 31
+
+ packssdw xmm3, xmm4
+ packsswb xmm3, xmm0
+
+ movq xmm1, QWORD PTR [rsi+rax*8]
+
+ movq xmm2, xmm1
+ punpcklbw xmm1, xmm0
+
+ paddw xmm1, xmm5
+ mov rcx, rdx
+
+ and rcx, 127
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+ push rax
+ lea rax, [GLOBAL(sym(aom_rv))]
+ movdqu xmm4, [rax + rcx*2] ;aom_rv[rcx*2]
+ pop rax
+%elif ABI_IS_32BIT=0
+ movdqu xmm4, [r8 + rcx*2] ;aom_rv[rcx*2]
+%else
+ movdqu xmm4, [sym(aom_rv) + rcx*2]
+%endif
+
+ paddw xmm1, xmm4
+ ;paddw xmm1, eight8s
+ psraw xmm1, 4
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm3
+
+ pandn xmm3, xmm2
+ por xmm1, xmm3
+
+ and rcx, 15
+ movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
+
+ cmp edx, 8
+ jl .skip_assignment
+
+ mov rcx, rdx
+ sub rcx, 8
+ and rcx, 15
+ movq mm0, [rsp + rcx*8] ;d[rcx*8]
+ movq [rsi], mm0
+
+.skip_assignment:
+ lea rsi, [rsi+rax]
+
+ lea rdi, [rdi+rax]
+ add rdx, 1
+
+ cmp edx, dword arg(2) ;rows
+ jl .loop_row
+
+ add dword arg(0), 8 ; s += 8
+ sub dword arg(3), 8 ; cols -= 8
+ cmp dword arg(3), 0
+ jg .loop_col
+
+ add rsp, 128+16
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit4
+
+
+;void aom_mbpost_proc_across_ip_xmm(unsigned char *src,
+; int pitch, int rows, int cols,int flimit)
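+;
+; Horizontal, in-place counterpart of aom_mbpost_proc_down_xmm: the sum/sumsq
+; window slides along each row (seeded by the i = -8..6 loop below) and the
+; same variance-vs-flimit test gates the filtered value.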
+global sym(aom_mbpost_proc_across_ip_xmm) PRIVATE
+sym(aom_mbpost_proc_across_ip_xmm):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16
+
+ ; create flimit4 at [rsp]
+ mov eax, dword ptr arg(4) ;flimit
+ mov [rsp], eax
+ mov [rsp+4], eax
+ mov [rsp+8], eax
+ mov [rsp+12], eax
+%define flimit4 [rsp]
+
+
+ ;for(r=0;r<rows;r++)
+.ip_row_loop:
+
+ xor rdx, rdx ;sumsq=0;
+ xor rcx, rcx ;sum=0;
+ mov rsi, arg(0); s
+
+
+ ; dup the first byte into the left border 8 times
+ movq mm1, [rsi]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+
+ mov rdi, -8
+ movq [rsi+rdi], mm1
+
+ ; dup the last byte into the right border
+ movsxd rdx, dword arg(3)
+ movq mm1, [rsi + rdx + -1]
+ punpcklbw mm1, mm1
+ punpcklwd mm1, mm1
+ punpckldq mm1, mm1
+ movq [rsi+rdx], mm1
+
+.ip_var_loop:
+ ;for(i=-8;i<=6;i++)
+ ;{
+ ; sumsq += s[i]*s[i];
+ ; sum += s[i];
+ ;}
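+        ; (edx accumulates sumsq, ecx accumulates sum; both are moved into xmm7/xmm6 below)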
+ movzx eax, byte [rsi+rdi]
+ add ecx, eax
+ mul al
+ add edx, eax
+ add rdi, 1
+ cmp rdi, 6
+ jle .ip_var_loop
+
+
+ ;mov rax, sumsq
+ ;movd xmm7, rax
+ movd xmm7, edx
+
+ ;mov rax, sum
+ ;movd xmm6, rax
+ movd xmm6, ecx
+
+ mov rsi, arg(0) ;s
+ xor rcx, rcx
+
+ movsxd rdx, dword arg(3) ;cols
+ add rdx, 8
+ pxor mm0, mm0
+ pxor mm1, mm1
+
+ pxor xmm0, xmm0
+.nextcol4:
+
+ movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5
+ movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10
+
+ punpcklbw xmm1, xmm0 ; expanding
+ punpcklbw xmm2, xmm0 ; expanding
+
+ punpcklwd xmm1, xmm0 ; expanding to dwords
+ punpcklwd xmm2, xmm0 ; expanding to dwords
+
+ psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5
+ paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2
+
+ paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5
+ pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5
+
+ paddd xmm6, xmm2
+ paddd xmm7, xmm1
+
+ pshufd xmm6, xmm6, 0 ; duplicate the last ones
+ pshufd xmm7, xmm7, 0 ; duplicate the last ones
+
+ psrldq xmm1, 4 ; 8--7 9--6 10--5 0000
+ psrldq xmm2, 4 ; 8--7 9--6 10--5 0000
+
+ pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared
+ pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm3
+
+ pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared
+ pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared
+ pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared
+
+ paddd xmm7, xmm3
+ paddd xmm6, xmm4
+
+ movdqa xmm3, xmm6
+ pmaddwd xmm3, xmm3
+
+ movdqa xmm5, xmm7
+ pslld xmm5, 4
+
+ psubd xmm5, xmm7
+ psubd xmm5, xmm3
+
+ psubd xmm5, flimit4
+ psrad xmm5, 31
+
+ packssdw xmm5, xmm0
+ packsswb xmm5, xmm0
+
+ movd xmm1, DWORD PTR [rsi+rcx]
+ movq xmm2, xmm1
+
+ punpcklbw xmm1, xmm0
+ punpcklwd xmm1, xmm0
+
+ paddd xmm1, xmm6
+ paddd xmm1, [GLOBAL(four8s)]
+
+ psrad xmm1, 4
+ packssdw xmm1, xmm0
+
+ packuswb xmm1, xmm0
+ pand xmm1, xmm5
+
+ pandn xmm5, xmm2
+ por xmm5, xmm1
+
+ movd [rsi+rcx-8], mm0
+ movq mm0, mm1
+
+ movdq2q mm1, xmm5
+ psrldq xmm7, 12
+
+ psrldq xmm6, 12
+ add rcx, 4
+
+ cmp rcx, rdx
+ jl .nextcol4
+
+ ;s+=pitch;
+ movsxd rax, dword arg(1)
+ add arg(0), rax
+
+ sub dword arg(2), 1 ;rows-=1
+ cmp dword arg(2), 0
+ jg .ip_row_loop
+
+ add rsp, 16
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+%undef flimit4
+
+
+SECTION_RODATA
+align 16
+four8s:
+ times 4 dd 8
diff --git a/aom_dsp/x86/fwd_dct32_8cols_sse2.c b/aom_dsp/x86/fwd_dct32_8cols_sse2.c
new file mode 100644
index 0000000..b8ec08d
--- /dev/null
+++ b/aom_dsp/x86/fwd_dct32_8cols_sse2.c
@@ -0,0 +1,862 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "aom_dsp/fwd_txfm.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+// Apply a 32-element forward DCT to 8 columns. This does not do any
+// transposition of its output - the caller is expected to do that.
+// The input buffers are the top and bottom halves of an 8x32 block.
+void fdct32_8col(__m128i *in0, __m128i *in1) {
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
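+  //    For example, assuming pair_set_epi16(a, b) puts a in the even and b in
+  //    the odd 16-bit lanes, _mm_madd_epi16 of such a constant with an
+  //    unpacklo'd (x, y) word pair yields a*x + b*y in every 32-bit lane,
+  //    i.e. one rotation term of the DCT butterfly.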
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
+ const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
+ const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
+ const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
+ const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
+ const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
+ const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
+ const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
+ const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i step1[32];
+ __m128i step2[32];
+ __m128i step3[32];
+ __m128i out[32];
+ // Stage 1
+ {
+ const __m128i *ina = in0;
+ const __m128i *inb = in1 + 15;
+ __m128i *step1a = &step1[0];
+ __m128i *step1b = &step1[31];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ }
+ {
+ const __m128i *ina = in0 + 4;
+ const __m128i *inb = in1 + 11;
+ __m128i *step1a = &step1[4];
+ __m128i *step1b = &step1[27];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ }
+ {
+ const __m128i *ina = in0 + 8;
+ const __m128i *inb = in1 + 7;
+ __m128i *step1a = &step1[8];
+ __m128i *step1b = &step1[23];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ }
+ {
+ const __m128i *ina = in0 + 12;
+ const __m128i *inb = in1 + 3;
+ __m128i *step1a = &step1[12];
+ __m128i *step1b = &step1[19];
+ const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
+ const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
+ const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
+ const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
+ const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
+ const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
+ const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
+ const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
+ step1a[0] = _mm_add_epi16(ina0, inb0);
+ step1a[1] = _mm_add_epi16(ina1, inb1);
+ step1a[2] = _mm_add_epi16(ina2, inb2);
+ step1a[3] = _mm_add_epi16(ina3, inb3);
+ step1b[-3] = _mm_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm_sub_epi16(ina0, inb0);
+ }
+ // Stage 2
+ {
+ step2[0] = _mm_add_epi16(step1[0], step1[15]);
+ step2[1] = _mm_add_epi16(step1[1], step1[14]);
+ step2[2] = _mm_add_epi16(step1[2], step1[13]);
+ step2[3] = _mm_add_epi16(step1[3], step1[12]);
+ step2[4] = _mm_add_epi16(step1[4], step1[11]);
+ step2[5] = _mm_add_epi16(step1[5], step1[10]);
+ step2[6] = _mm_add_epi16(step1[6], step1[9]);
+ step2[7] = _mm_add_epi16(step1[7], step1[8]);
+ step2[8] = _mm_sub_epi16(step1[7], step1[8]);
+ step2[9] = _mm_sub_epi16(step1[6], step1[9]);
+ step2[10] = _mm_sub_epi16(step1[5], step1[10]);
+ step2[11] = _mm_sub_epi16(step1[4], step1[11]);
+ step2[12] = _mm_sub_epi16(step1[3], step1[12]);
+ step2[13] = _mm_sub_epi16(step1[2], step1[13]);
+ step2[14] = _mm_sub_epi16(step1[1], step1[14]);
+ step2[15] = _mm_sub_epi16(step1[0], step1[15]);
+ }
+ {
+ const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
+ const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
+ const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
+ const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
+ const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
+ const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
+ const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
+ const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
+ const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
+ const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
+ const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
+ const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
+ const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
+ const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
+ const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
+ const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
+ const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
+ const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
+ const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
+ const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
+ const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
+ const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
+ const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
+ const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
+ // dct_const_round_shift
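+    //   (v + DCT_CONST_ROUNDING) >> DCT_CONST_BITS, with DCT_CONST_ROUNDING
+    //   defined as 1 << (DCT_CONST_BITS - 1): rounds the fixed-point products
+    //   above to the nearest integer.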
+ const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
+ const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
+ const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
+ const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
+ const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
+ const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
+ const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
+ const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
+ const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
+ const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
+ const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
+ const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
+ const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
+ const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
+ const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
+ const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
+ // Combine
+ step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
+ step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
+ step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
+ step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
+ step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
+ step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
+ step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
+ step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
+ }
+ // Stage 3
+ {
+ step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
+ step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
+ step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
+ step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
+ step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
+ step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
+ step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
+ step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
+ }
+ {
+ const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
+ const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
+ const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
+ const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
+ const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ // Combine
+ step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
+ step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
+ step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
+ step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
+ }
+ {
+ step3[16] = _mm_add_epi16(step2[23], step1[16]);
+ step3[17] = _mm_add_epi16(step2[22], step1[17]);
+ step3[18] = _mm_add_epi16(step2[21], step1[18]);
+ step3[19] = _mm_add_epi16(step2[20], step1[19]);
+ step3[20] = _mm_sub_epi16(step1[19], step2[20]);
+ step3[21] = _mm_sub_epi16(step1[18], step2[21]);
+ step3[22] = _mm_sub_epi16(step1[17], step2[22]);
+ step3[23] = _mm_sub_epi16(step1[16], step2[23]);
+ step3[24] = _mm_sub_epi16(step1[31], step2[24]);
+ step3[25] = _mm_sub_epi16(step1[30], step2[25]);
+ step3[26] = _mm_sub_epi16(step1[29], step2[26]);
+ step3[27] = _mm_sub_epi16(step1[28], step2[27]);
+ step3[28] = _mm_add_epi16(step2[27], step1[28]);
+ step3[29] = _mm_add_epi16(step2[26], step1[29]);
+ step3[30] = _mm_add_epi16(step2[25], step1[30]);
+ step3[31] = _mm_add_epi16(step2[24], step1[31]);
+ }
+
+ // Stage 4
+ {
+ step1[0] = _mm_add_epi16(step3[3], step3[0]);
+ step1[1] = _mm_add_epi16(step3[2], step3[1]);
+ step1[2] = _mm_sub_epi16(step3[1], step3[2]);
+ step1[3] = _mm_sub_epi16(step3[0], step3[3]);
+ step1[8] = _mm_add_epi16(step3[11], step2[8]);
+ step1[9] = _mm_add_epi16(step3[10], step2[9]);
+ step1[10] = _mm_sub_epi16(step2[9], step3[10]);
+ step1[11] = _mm_sub_epi16(step2[8], step3[11]);
+ step1[12] = _mm_sub_epi16(step2[15], step3[12]);
+ step1[13] = _mm_sub_epi16(step2[14], step3[13]);
+ step1[14] = _mm_add_epi16(step3[13], step2[14]);
+ step1[15] = _mm_add_epi16(step3[12], step2[15]);
+ }
+ {
+ const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
+ const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
+ const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
+ const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
+ const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
+ const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
+ const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
+ const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
+ const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
+ // Combine
+ step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
+ step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
+ }
+ {
+ const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
+ const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
+ const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
+ const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
+ const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
+ const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
+ const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
+ const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
+ const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
+ const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
+ const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
+ const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
+ const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
+ const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
+ const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
+ const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
+ const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
+ const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
+ const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
+ const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
+ const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
+ const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
+ const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
+ const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
+ const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
+ const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
+ const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
+ const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
+ const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
+ const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
+ const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
+ const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
+ const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
+ const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
+ const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
+ const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
+ const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
+ const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
+ const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ // Combine
+ step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
+ step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
+ step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
+ step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
+ step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
+ step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
+ step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
+ step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
+ }
+ // Stage 5
+ {
+ step2[4] = _mm_add_epi16(step1[5], step3[4]);
+ step2[5] = _mm_sub_epi16(step3[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step3[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[6], step3[7]);
+ }
+ {
+ const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
+ const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
+ const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
+ const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
+ const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
+ const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
+ const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
+ const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
+ const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
+ const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
+ const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
+ const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
+ const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
+ const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
+ const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
+ const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
+ const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
+ const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
+ const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
+ // Combine
+ out[0] = _mm_packs_epi32(out_00_6, out_00_7);
+ out[16] = _mm_packs_epi32(out_16_6, out_16_7);
+ out[8] = _mm_packs_epi32(out_08_6, out_08_7);
+ out[24] = _mm_packs_epi32(out_24_6, out_24_7);
+ }
+ {
+ const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
+ const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
+ const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
+ const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
+ const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
+ const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
+ const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
+ const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
+ const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
+ const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
+ const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
+ const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
+ const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
+ const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
+ const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
+ const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
+ const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
+ const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
+ const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
+ // Combine
+ step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
+ step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
+ step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
+ step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
+ }
+ {
+ step2[16] = _mm_add_epi16(step1[19], step3[16]);
+ step2[17] = _mm_add_epi16(step1[18], step3[17]);
+ step2[18] = _mm_sub_epi16(step3[17], step1[18]);
+ step2[19] = _mm_sub_epi16(step3[16], step1[19]);
+ step2[20] = _mm_sub_epi16(step3[23], step1[20]);
+ step2[21] = _mm_sub_epi16(step3[22], step1[21]);
+ step2[22] = _mm_add_epi16(step1[21], step3[22]);
+ step2[23] = _mm_add_epi16(step1[20], step3[23]);
+ step2[24] = _mm_add_epi16(step1[27], step3[24]);
+ step2[25] = _mm_add_epi16(step1[26], step3[25]);
+ step2[26] = _mm_sub_epi16(step3[25], step1[26]);
+ step2[27] = _mm_sub_epi16(step3[24], step1[27]);
+ step2[28] = _mm_sub_epi16(step3[31], step1[28]);
+ step2[29] = _mm_sub_epi16(step3[30], step1[29]);
+ step2[30] = _mm_add_epi16(step1[29], step3[30]);
+ step2[31] = _mm_add_epi16(step1[28], step3[31]);
+ }
+ // Stage 6
+ {
+ const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
+ const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
+ const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
+ const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
+ const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
+ const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
+ const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
+ const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
+ const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
+ const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
+ const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
+ const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
+ // dct_const_round_shift
+ const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
+ const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
+ const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
+ const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
+ const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
+ const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
+ const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
+ const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
+ // Combine
+ out[4] = _mm_packs_epi32(out_04_6, out_04_7);
+ out[20] = _mm_packs_epi32(out_20_6, out_20_7);
+ out[12] = _mm_packs_epi32(out_12_6, out_12_7);
+ out[28] = _mm_packs_epi32(out_28_6, out_28_7);
+ }
+ {
+ step3[8] = _mm_add_epi16(step2[9], step1[8]);
+ step3[9] = _mm_sub_epi16(step1[8], step2[9]);
+ step3[10] = _mm_sub_epi16(step1[11], step2[10]);
+ step3[11] = _mm_add_epi16(step2[10], step1[11]);
+ step3[12] = _mm_add_epi16(step2[13], step1[12]);
+ step3[13] = _mm_sub_epi16(step1[12], step2[13]);
+ step3[14] = _mm_sub_epi16(step1[15], step2[14]);
+ step3[15] = _mm_add_epi16(step2[14], step1[15]);
+ }
+ {
+ const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
+ const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
+ const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
+ const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
+ const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
+ const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
+ const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
+ const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
+ const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
+ const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
+ const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
+ const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
+ const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
+ const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
+ const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
+ const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
+ const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
+ const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
+ const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
+ const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
+ const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
+ const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
+ const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
+ const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
+ // dct_const_round_shift
+ const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
+ const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
+ const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
+ const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
+ const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
+ const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
+ const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
+ const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
+ const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
+ const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
+ const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
+ const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
+ const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
+ const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
+ const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
+ const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
+ // Combine
+ step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
+ step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
+ step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
+ step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
+ // Combine
+ step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
+ step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
+ step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
+ step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
+ }
+ // Stage 7
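+  // Stage 7 combines the step3[8..15] pairs with the stage cosine constants
+  // to produce outputs 2, 18, 10, 26, 6, 22, 14 and 30, then forms
+  // step1[16..31] for the final stage.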
+ {
+ const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
+ const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
+ const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
+ const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
+ const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
+ const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
+ const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
+ const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
+ const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
+ const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
+ const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
+ const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
+ const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
+ const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
+ const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
+ const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
+ const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
+ const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
+ const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
+ const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
+ const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
+ const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
+ const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
+ const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
+ // dct_const_round_shift
+ const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
+ const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
+ const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
+ const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
+ const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
+ const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
+ const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
+ const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
+ const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
+ const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
+ const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
+ const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
+ const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
+ const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
+ const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
+ const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
+ // Combine
+ out[2] = _mm_packs_epi32(out_02_6, out_02_7);
+ out[18] = _mm_packs_epi32(out_18_6, out_18_7);
+ out[10] = _mm_packs_epi32(out_10_6, out_10_7);
+ out[26] = _mm_packs_epi32(out_26_6, out_26_7);
+ out[6] = _mm_packs_epi32(out_06_6, out_06_7);
+ out[22] = _mm_packs_epi32(out_22_6, out_22_7);
+ out[14] = _mm_packs_epi32(out_14_6, out_14_7);
+ out[30] = _mm_packs_epi32(out_30_6, out_30_7);
+ }
+ {
+ step1[16] = _mm_add_epi16(step3[17], step2[16]);
+ step1[17] = _mm_sub_epi16(step2[16], step3[17]);
+ step1[18] = _mm_sub_epi16(step2[19], step3[18]);
+ step1[19] = _mm_add_epi16(step3[18], step2[19]);
+ step1[20] = _mm_add_epi16(step3[21], step2[20]);
+ step1[21] = _mm_sub_epi16(step2[20], step3[21]);
+ step1[22] = _mm_sub_epi16(step2[23], step3[22]);
+ step1[23] = _mm_add_epi16(step3[22], step2[23]);
+ step1[24] = _mm_add_epi16(step3[25], step2[24]);
+ step1[25] = _mm_sub_epi16(step2[24], step3[25]);
+ step1[26] = _mm_sub_epi16(step2[27], step3[26]);
+ step1[27] = _mm_add_epi16(step3[26], step2[27]);
+ step1[28] = _mm_add_epi16(step3[29], step2[28]);
+ step1[29] = _mm_sub_epi16(step2[28], step3[29]);
+ step1[30] = _mm_sub_epi16(step2[31], step3[30]);
+ step1[31] = _mm_add_epi16(step3[30], step2[31]);
+ }
+  // Final stage --- output indices are bit-reversed.
+ {
+ const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
+ const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
+ const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
+ const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
+ const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
+ const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
+ const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
+ const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
+ const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
+ const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
+ const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
+ const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
+ const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
+ const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
+ const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
+ const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
+ const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
+ const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
+ const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
+ const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
+ const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
+ const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
+ const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
+ const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
+ // dct_const_round_shift
+ const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
+ const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
+ const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
+ const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
+ const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
+ const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
+ const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
+ const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
+ const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
+ const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
+ const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
+ const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
+ const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
+ const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
+ const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
+ const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
+ // Combine
+ out[1] = _mm_packs_epi32(out_01_6, out_01_7);
+ out[17] = _mm_packs_epi32(out_17_6, out_17_7);
+ out[9] = _mm_packs_epi32(out_09_6, out_09_7);
+ out[25] = _mm_packs_epi32(out_25_6, out_25_7);
+ out[7] = _mm_packs_epi32(out_07_6, out_07_7);
+ out[23] = _mm_packs_epi32(out_23_6, out_23_7);
+ out[15] = _mm_packs_epi32(out_15_6, out_15_7);
+ out[31] = _mm_packs_epi32(out_31_6, out_31_7);
+ }
+ {
+ const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
+ const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
+ const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
+ const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
+ const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
+ const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
+ const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
+ const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
+ const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
+ const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
+ const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
+ const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
+ const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
+ const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
+ const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
+ const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
+ const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
+ const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
+ const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
+ const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
+ const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
+ const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
+ const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
+ const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
+ // dct_const_round_shift
+ const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+ const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+ const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
+ const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
+ const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
+ const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
+ const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
+ const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
+ const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
+ const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
+ const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
+ const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
+ const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
+ const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
+ const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
+ const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
+ const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
+ const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
+ // Combine
+ out[5] = _mm_packs_epi32(out_05_6, out_05_7);
+ out[21] = _mm_packs_epi32(out_21_6, out_21_7);
+ out[13] = _mm_packs_epi32(out_13_6, out_13_7);
+ out[29] = _mm_packs_epi32(out_29_6, out_29_7);
+ out[3] = _mm_packs_epi32(out_03_6, out_03_7);
+ out[19] = _mm_packs_epi32(out_19_6, out_19_7);
+ out[11] = _mm_packs_epi32(out_11_6, out_11_7);
+ out[27] = _mm_packs_epi32(out_27_6, out_27_7);
+ }
+
+ // Output results
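+  // The 32 transformed rows are written back in place: out[0..15] through
+  // in0 and out[16..31] through in1.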
+ {
+ int j;
+ for (j = 0; j < 16; ++j) {
+ _mm_storeu_si128((__m128i *)(in0 + j), out[j]);
+ _mm_storeu_si128((__m128i *)(in1 + j), out[j + 16]);
+ }
+ }
+} // NOLINT
diff --git a/aom_dsp/x86/fwd_dct32x32_impl_avx2.h b/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
index fc7d322..2167395 100644
--- a/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ b/aom_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -12,16 +12,8 @@
#include <immintrin.h> // AVX2
#include "aom_dsp/txfm_common.h"
-
-#define pair256_set_epi16(a, b) \
- _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
- (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
- (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
- (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
-
-#define pair256_set_epi32(a, b) \
- _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
- (int)(b), (int)(a))
+#include "aom_dsp/x86/txfm_common_intrin.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
#if FDCT32x32_HIGH_PRECISION
static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
@@ -40,7 +32,19 @@
}
#endif
-void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
+#ifndef STORE_COEFF_FUNC
+#define STORE_COEFF_FUNC
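+// store_coeff writes the low 128-bit lane of *coeff to the current output row
+// and the high lane to the next row; storeu_output() widens the 16-bit values
+// to 32-bit tran_low_t when CONFIG_AOM_HIGHBITDEPTH is enabled.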
+static void store_coeff(const __m256i *coeff, tran_low_t *curr,
+ tran_low_t *next) {
+ __m128i u = _mm256_castsi256_si128(*coeff);
+ storeu_output(&u, curr);
+ u = _mm256_extractf128_si256(*coeff, 1);
+ storeu_output(&u, next);
+}
+#endif
+
+void FDCT32x32_2D_AVX2(const int16_t *input, tran_low_t *output_org,
+ int stride) {
// Calculate pre-multiplied strides
const int str1 = stride;
const int str2 = 2 * stride;
@@ -2851,13 +2855,14 @@
{
int transpose_block;
int16_t *output_currStep, *output_nextStep;
- if (0 == pass) {
- output_currStep = &intermediate[column_start * 32];
- output_nextStep = &intermediate[(column_start + 8) * 32];
- } else {
- output_currStep = &output_org[column_start * 32];
- output_nextStep = &output_org[(column_start + 8) * 32];
- }
+ tran_low_t *curr_out, *next_out;
+ // Pass 0
+ output_currStep = &intermediate[column_start * 32];
+ output_nextStep = &intermediate[(column_start + 8) * 32];
+ // Pass 1
+ curr_out = &output_org[column_start * 32];
+ next_out = &output_org[(column_start + 8) * 32];
+
for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
__m256i *this_out = &out[8 * transpose_block];
// 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
@@ -2957,46 +2962,61 @@
tr2_6 = _mm256_srai_epi16(tr2_6, 2);
tr2_7 = _mm256_srai_epi16(tr2_7, 2);
}
- // Note: even though all these stores are aligned, using the aligned
- // intrinsic make the code slightly slower.
- _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32),
- _mm256_castsi256_si128(tr2_0));
- _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32),
- _mm256_castsi256_si128(tr2_1));
- _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32),
- _mm256_castsi256_si128(tr2_2));
- _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32),
- _mm256_castsi256_si128(tr2_3));
- _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32),
- _mm256_castsi256_si128(tr2_4));
- _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32),
- _mm256_castsi256_si128(tr2_5));
- _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32),
- _mm256_castsi256_si128(tr2_6));
- _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32),
- _mm256_castsi256_si128(tr2_7));
+ if (0 == pass) {
+ // Note: even though all these stores are aligned, using the aligned
+        // intrinsic makes the code slightly slower.
+ _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32),
+ _mm256_castsi256_si128(tr2_0));
+ _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32),
+ _mm256_castsi256_si128(tr2_1));
+ _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32),
+ _mm256_castsi256_si128(tr2_2));
+ _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32),
+ _mm256_castsi256_si128(tr2_3));
+ _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32),
+ _mm256_castsi256_si128(tr2_4));
+ _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32),
+ _mm256_castsi256_si128(tr2_5));
+ _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32),
+ _mm256_castsi256_si128(tr2_6));
+ _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32),
+ _mm256_castsi256_si128(tr2_7));
- _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32),
- _mm256_extractf128_si256(tr2_0, 1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32),
- _mm256_extractf128_si256(tr2_1, 1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32),
- _mm256_extractf128_si256(tr2_2, 1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32),
- _mm256_extractf128_si256(tr2_3, 1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32),
- _mm256_extractf128_si256(tr2_4, 1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32),
- _mm256_extractf128_si256(tr2_5, 1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32),
- _mm256_extractf128_si256(tr2_6, 1));
- _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32),
- _mm256_extractf128_si256(tr2_7, 1));
- // Process next 8x8
- output_currStep += 8;
- output_nextStep += 8;
+ _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32),
+ _mm256_extractf128_si256(tr2_0, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32),
+ _mm256_extractf128_si256(tr2_1, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32),
+ _mm256_extractf128_si256(tr2_2, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32),
+ _mm256_extractf128_si256(tr2_3, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32),
+ _mm256_extractf128_si256(tr2_4, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32),
+ _mm256_extractf128_si256(tr2_5, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32),
+ _mm256_extractf128_si256(tr2_6, 1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32),
+ _mm256_extractf128_si256(tr2_7, 1));
+ // Process next 8x8
+ output_currStep += 8;
+ output_nextStep += 8;
+ }
+ if (1 == pass) {
+ store_coeff(&tr2_0, curr_out + 0 * 32, next_out + 0 * 32);
+ store_coeff(&tr2_1, curr_out + 1 * 32, next_out + 1 * 32);
+ store_coeff(&tr2_2, curr_out + 2 * 32, next_out + 2 * 32);
+ store_coeff(&tr2_3, curr_out + 3 * 32, next_out + 3 * 32);
+ store_coeff(&tr2_4, curr_out + 4 * 32, next_out + 4 * 32);
+ store_coeff(&tr2_5, curr_out + 5 * 32, next_out + 5 * 32);
+ store_coeff(&tr2_6, curr_out + 6 * 32, next_out + 6 * 32);
+ store_coeff(&tr2_7, curr_out + 7 * 32, next_out + 7 * 32);
+ curr_out += 8;
+ next_out += 8;
+ }
}
}
}
}
+ _mm256_zeroupper();
} // NOLINT
diff --git a/aom_dsp/x86/fwd_txfm_avx2.h b/aom_dsp/x86/fwd_txfm_avx2.h
new file mode 100644
index 0000000..2c3cfc8
--- /dev/null
+++ b/aom_dsp/x86/fwd_txfm_avx2.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_FWD_TXFM_AVX2_H
+#define AOM_DSP_X86_FWD_TXFM_AVX2_H
+
+#include "./aom_config.h"
+
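+// When CONFIG_AOM_HIGHBITDEPTH is enabled, the 16 coefficients are
+// sign-extended to 32-bit tran_low_t: the sign mask is interleaved with the
+// data and the 128-bit lanes are re-ordered so the values land in memory in
+// their original order.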
+static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);
+
+ __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
+ __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);
+
+ __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
+ __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);
+
+ _mm256_storeu_si256((__m256i *)out, y0);
+ _mm256_storeu_si256((__m256i *)(out + 8), y1);
+#else
+ _mm256_storeu_si256((__m256i *)out, *coeff);
+#endif
+}
+
+#endif // AOM_DSP_X86_FWD_TXFM_AVX2_H
diff --git a/aom_dsp/x86/fwd_txfm_sse2.c b/aom_dsp/x86/fwd_txfm_sse2.c
index 4dcc67c..e398548 100644
--- a/aom_dsp/x86/fwd_txfm_sse2.c
+++ b/aom_dsp/x86/fwd_txfm_sse2.c
@@ -12,6 +12,7 @@
#include <emmintrin.h> // SSE2
#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/x86/fwd_txfm_sse2.h"
@@ -41,7 +42,7 @@
in1 = _mm_add_epi32(tmp, in0);
in0 = _mm_slli_epi32(in1, 1);
- store_output(&in0, output);
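+  // Only the DC term (output[0]) is needed here, so store it as a single
+  // scalar rather than a full vector.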
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
}
void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
@@ -81,7 +82,7 @@
in0 = _mm_srli_si128(sum, 8);
in1 = _mm_add_epi32(sum, in0);
- store_output(&in1, output);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
}
void aom_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
@@ -92,40 +93,39 @@
int i;
for (i = 0; i < 2; ++i) {
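+    // Each iteration sums eight full 16-wide rows (both 8-wide halves of each
+    // row); the input pointer advances by eight rows at the end of the loop.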
- input += 8 * i;
- in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
- in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
- in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));
+ in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
u1 = _mm_add_epi16(in2, in3);
sum = _mm_add_epi16(sum, u0);
- in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));
+ in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
+ in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
+ in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
+ in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));
sum = _mm_add_epi16(sum, u1);
u0 = _mm_add_epi16(in0, in1);
@@ -133,6 +133,7 @@
sum = _mm_add_epi16(sum, u0);
sum = _mm_add_epi16(sum, u1);
+ input += 8 * stride;
}
u0 = _mm_setzero_si128();
@@ -150,7 +151,7 @@
in1 = _mm_add_epi32(sum, in0);
in1 = _mm_srai_epi32(in1, 1);
- store_output(&in1, output);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
}
void aom_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
@@ -222,7 +223,7 @@
in1 = _mm_add_epi32(sum, in0);
in1 = _mm_srai_epi32(in1, 3);
- store_output(&in1, output);
+ output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
}
#define DCT_HIGH_BIT_DEPTH 0
diff --git a/aom_dsp/x86/fwd_txfm_sse2.h b/aom_dsp/x86/fwd_txfm_sse2.h
index 4b243ba..fe3e446 100644
--- a/aom_dsp/x86/fwd_txfm_sse2.h
+++ b/aom_dsp/x86/fwd_txfm_sse2.h
@@ -12,6 +12,8 @@
#ifndef AOM_DSP_X86_FWD_TXFM_SSE2_H_
#define AOM_DSP_X86_FWD_TXFM_SSE2_H_
+#include "aom_dsp/x86/txfm_common_intrin.h"
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -257,19 +259,6 @@
#endif // CONFIG_AOM_HIGHBITDEPTH
}
-static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
-#if CONFIG_AOM_HIGHBITDEPTH
- const __m128i zero = _mm_setzero_si128();
- const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
- __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
- __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
- _mm_storeu_si128((__m128i *)(dst_ptr), out0);
- _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
-#else
- _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
-#endif // CONFIG_AOM_HIGHBITDEPTH
-}
-
static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
const __m128i *pmultiplier,
const __m128i *prounding,
@@ -365,6 +354,8 @@
}
}
+void fdct32_8col(__m128i *in0, __m128i *in1);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
index 6f3c470..5b2aab2 100644
--- a/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+++ b/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -130,12 +130,30 @@
psraw m%2, 1
%endmacro
+%macro STORE_OUTPUT 2 ; index, result
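+  ; Stores 8 coefficients starting at the given index. With
+  ; CONFIG_AOM_HIGHBITDEPTH the 16-bit words are sign-extended to 32-bit
+  ; tran_low_t, so the offset scales by 4 bytes per coefficient; otherwise the
+  ; words are stored directly at 2 bytes each.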
+%if CONFIG_AOM_HIGHBITDEPTH
+ ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ ; _mm_store_si128((__m128i *)(dst_ptr), out0);
+ ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+ pxor m11, m11
+ pcmpgtw m11, m%2
+ movdqa m12, m%2
+ punpcklwd m%2, m11
+ punpckhwd m12, m11
+ mova [outputq + 4*%1 + 0], m%2
+ mova [outputq + 4*%1 + 16], m12
+%else
+ mova [outputq + 2*%1], m%2
+%endif
+%endmacro
+
INIT_XMM ssse3
cglobal fdct8x8, 3, 5, 13, input, output, stride
mova m8, [pd_8192]
mova m12, [pw_11585x2]
- pxor m11, m11
lea r3, [2 * strideq]
lea r4, [4 * strideq]
@@ -173,14 +191,14 @@
DIVIDE_ROUND_2X 4, 5, 9, 10
DIVIDE_ROUND_2X 6, 7, 9, 10
- mova [outputq + 0], m0
- mova [outputq + 16], m1
- mova [outputq + 32], m2
- mova [outputq + 48], m3
- mova [outputq + 64], m4
- mova [outputq + 80], m5
- mova [outputq + 96], m6
- mova [outputq + 112], m7
+ STORE_OUTPUT 0, 0
+ STORE_OUTPUT 8, 1
+ STORE_OUTPUT 16, 2
+ STORE_OUTPUT 24, 3
+ STORE_OUTPUT 32, 4
+ STORE_OUTPUT 40, 5
+ STORE_OUTPUT 48, 6
+ STORE_OUTPUT 56, 7
RET
%endif
diff --git a/aom_dsp/x86/highbd_subtract_sse2.c b/aom_dsp/x86/highbd_subtract_sse2.c
new file mode 100644
index 0000000..23d6630
--- /dev/null
+++ b/aom_dsp/x86/highbd_subtract_sse2.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stddef.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred,
+ ptrdiff_t pred_stride);
+
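+// Each subtract_WxH kernel below loads rows of 16-bit source and prediction
+// pixels, computes src - pred with _mm_sub_epi16 and stores the 16-bit
+// residuals; the larger block sizes are built by tiling the smaller kernels.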
+static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+ __m128i x0, x1, x2, x3;
+ int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
+
+ u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+ v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+
+ _mm_storel_epi64((__m128i *)store_diff, x0);
+ store_diff = (int64_t *)(diff + 1 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x1);
+ store_diff = (int64_t *)(diff + 2 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x2);
+ store_diff = (int64_t *)(diff + 3 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x3);
+}
+
+static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
+
+ u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+ u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
+ u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
+ u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
+ u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
+
+ v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
+ v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
+ v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
+ v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
+ v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+ x4 = _mm_sub_epi16(u4, v4);
+ x5 = _mm_sub_epi16(u5, v5);
+ x6 = _mm_sub_epi16(u6, v6);
+ x7 = _mm_sub_epi16(u7, v7);
+
+ _mm_storel_epi64((__m128i *)store_diff, x0);
+ store_diff = (int64_t *)(diff + 1 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x1);
+ store_diff = (int64_t *)(diff + 2 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x2);
+ store_diff = (int64_t *)(diff + 3 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x3);
+ store_diff = (int64_t *)(diff + 4 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x4);
+ store_diff = (int64_t *)(diff + 5 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x5);
+ store_diff = (int64_t *)(diff + 6 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x6);
+ store_diff = (int64_t *)(diff + 7 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x7);
+}
+
+static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+ __m128i x0, x1, x2, x3;
+
+ u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+ v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+
+ _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
+ _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
+ _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
+ _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
+}
+
+static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+
+ u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+ u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
+ u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
+ u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
+ u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
+
+ v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
+ v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
+ v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
+ v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
+ v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+ x4 = _mm_sub_epi16(u4, v4);
+ x5 = _mm_sub_epi16(u5, v5);
+ x6 = _mm_sub_epi16(u6, v6);
+ x7 = _mm_sub_epi16(u7, v7);
+
+ _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
+ _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
+ _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
+ _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
+ _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4);
+ _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5);
+ _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6);
+ _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7);
+}
+
+static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += diff_stride << 3;
+ src += src_stride << 3;
+ pred += pred_stride << 3;
+ subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_16x8(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += 8;
+ src += 8;
+ pred += 8;
+ subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_16x16(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += diff_stride << 3;
+ src += src_stride << 3;
+ pred += pred_stride << 3;
+ subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_16x32(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += diff_stride << 4;
+ src += src_stride << 4;
+ pred += pred_stride << 4;
+ subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_32x16(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += 16;
+ src += 16;
+ pred += 16;
+ subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_32x32(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += diff_stride << 4;
+ src += src_stride << 4;
+ pred += pred_stride << 4;
+ subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_32x64(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += diff_stride << 5;
+ src += src_stride << 5;
+ pred += pred_stride << 5;
+ subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_64x32(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += 32;
+ src += 32;
+ pred += 32;
+ subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_64x64(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += diff_stride << 5;
+ src += src_stride << 5;
+ pred += pred_stride << 5;
+ subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_64x128(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += diff_stride << 6;
+ src += src_stride << 6;
+ pred += pred_stride << 6;
+ subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_128x64(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += 64;
+ src += 64;
+ pred += 64;
+ subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_128x128(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+ diff += diff_stride << 6;
+ src += src_stride << 6;
+ pred += pred_stride << 6;
+ subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
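+// Map the (rows, cols) block size to the width x height kernel above;
+// unsupported sizes hit the assert below.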
+static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
+ SubtractWxHFuncType ret_func_ptr = NULL;
+ if (rows == 4) {
+ if (cols == 4) {
+ ret_func_ptr = subtract_4x4;
+ } else if (cols == 8) {
+ ret_func_ptr = subtract_8x4;
+ }
+ } else if (rows == 8) {
+ if (cols == 4) {
+ ret_func_ptr = subtract_4x8;
+ } else if (cols == 8) {
+ ret_func_ptr = subtract_8x8;
+ } else if (cols == 16) {
+ ret_func_ptr = subtract_16x8;
+ }
+ } else if (rows == 16) {
+ if (cols == 8) {
+ ret_func_ptr = subtract_8x16;
+ } else if (cols == 16) {
+ ret_func_ptr = subtract_16x16;
+ } else if (cols == 32) {
+ ret_func_ptr = subtract_32x16;
+ }
+ } else if (rows == 32) {
+ if (cols == 16) {
+ ret_func_ptr = subtract_16x32;
+ } else if (cols == 32) {
+ ret_func_ptr = subtract_32x32;
+ } else if (cols == 64) {
+ ret_func_ptr = subtract_64x32;
+ }
+ } else if (rows == 64) {
+ if (cols == 32) {
+ ret_func_ptr = subtract_32x64;
+ } else if (cols == 64) {
+ ret_func_ptr = subtract_64x64;
+ } else if (cols == 128) {
+ ret_func_ptr = subtract_128x64;
+ }
+ } else if (rows == 128) {
+ if (cols == 64) {
+ ret_func_ptr = subtract_64x128;
+ } else if (cols == 128) {
+ ret_func_ptr = subtract_128x128;
+ }
+ }
+ if (!ret_func_ptr) {
+ assert(0);
+ }
+ return ret_func_ptr;
+}
+
+void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src8,
+ ptrdiff_t src_stride, const uint8_t *pred8,
+ ptrdiff_t pred_stride, int bd) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ SubtractWxHFuncType func;
+ (void)bd;
+
+ func = getSubtractFunc(rows, cols);
+ func(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
diff --git a/aom_dsp/x86/highbd_variance_sse2.c b/aom_dsp/x86/highbd_variance_sse2.c
index 38a6a1e..26b99b9 100644
--- a/aom_dsp/x86/highbd_variance_sse2.c
+++ b/aom_dsp/x86/highbd_variance_sse2.c
@@ -125,10 +125,8 @@
*sse = ROUND_POWER_OF_TWO(*sse, 8); \
}
-/* clang-format off */
-HIGH_GET_VAR(16)
-HIGH_GET_VAR(8)
-/* clang-format on */
+HIGH_GET_VAR(16);
+HIGH_GET_VAR(8);
#undef HIGH_GET_VAR
@@ -149,38 +147,40 @@
const uint8_t *src8, int src_stride, const uint8_t *ref8, \
int ref_stride, uint32_t *sse) { \
int sum; \
+ int64_t var; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_10_variance_sse2( \
src, src_stride, ref, ref_stride, w, h, sse, &sum, \
aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
- return *sse - (((int64_t)sum * sum) >> shift); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
} \
\
uint32_t aom_highbd_12_variance##w##x##h##_sse2( \
const uint8_t *src8, int src_stride, const uint8_t *ref8, \
int ref_stride, uint32_t *sse) { \
int sum; \
+ int64_t var; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
highbd_12_variance_sse2( \
src, src_stride, ref, ref_stride, w, h, sse, &sum, \
aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
- return *sse - (((int64_t)sum * sum) >> shift); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
}
-/* clang-format off */
-VAR_FN(64, 64, 16, 12)
-VAR_FN(64, 32, 16, 11)
-VAR_FN(32, 64, 16, 11)
-VAR_FN(32, 32, 16, 10)
-VAR_FN(32, 16, 16, 9)
-VAR_FN(16, 32, 16, 9)
-VAR_FN(16, 16, 16, 8)
-VAR_FN(16, 8, 8, 7)
-VAR_FN(8, 16, 8, 7)
-VAR_FN(8, 8, 8, 6)
-/* clang-format on */
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
#undef VAR_FN
@@ -258,13 +258,12 @@
const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
const uint16_t *dst, ptrdiff_t dst_stride, int height, \
unsigned int *sse, void *unused0, void *unused);
-#define DECLS(opt1, opt2) \
- DECL(8, opt1); \
- DECL(16, opt1)
+#define DECLS(opt) \
+ DECL(8, opt); \
+ DECL(16, opt)
-DECLS(sse2, sse);
-// TODO(johannkoenig): enable the ssse3 or delete
-// DECLS(ssse3, ssse3);
+DECLS(sse2);
+
#undef DECLS
#undef DECL
@@ -384,20 +383,20 @@
return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
-#define FNS(opt1, opt2) \
- FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
- FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
- FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
- FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
- FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
- FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
- FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
- FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
- FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
- FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
- FN(8, 4, 8, 3, 2, opt1, (int64_t));
+#define FNS(opt) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t)); \
+ FN(64, 32, 16, 6, 5, opt, (int64_t)); \
+ FN(32, 64, 16, 5, 6, opt, (int64_t)); \
+ FN(32, 32, 16, 5, 5, opt, (int64_t)); \
+ FN(32, 16, 16, 5, 4, opt, (int64_t)); \
+ FN(16, 32, 16, 4, 5, opt, (int64_t)); \
+ FN(16, 16, 16, 4, 4, opt, (int64_t)); \
+ FN(16, 8, 16, 4, 3, opt, (int64_t)); \
+ FN(8, 16, 8, 3, 4, opt, (int64_t)); \
+ FN(8, 8, 8, 3, 3, opt, (int64_t)); \
+ FN(8, 4, 8, 3, 2, opt, (int64_t));
-FNS(sse2, sse);
+FNS(sse2);
#undef FNS
#undef FN
@@ -557,11 +556,11 @@
#undef FNS
#undef FN
-void aom_highbd_upsampled_pred_sse2(uint16_t *pred, int width, int height,
- const uint8_t *ref8, const int ref_stride) {
- const int stride = ref_stride << 3;
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height,
+ const uint8_t *ref8, int ref_stride) {
int i, j;
+ int stride = ref_stride << 3;
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
if (width >= 8) {
// read 8 points at one time
@@ -585,8 +584,8 @@
t2 = _mm_unpacklo_epi32(t2, t3);
t0 = _mm_unpacklo_epi64(t0, t2);
- _mm_storeu_si128((__m128i *)(pred), t0);
- pred += 8;
+ _mm_storeu_si128((__m128i *)(comp_pred), t0);
+ comp_pred += 8;
ref += 64; // 8 * 8;
}
ref += stride - (width << 3);
@@ -605,8 +604,8 @@
t1 = _mm_unpacklo_epi16(s2, s3);
t0 = _mm_unpacklo_epi32(t0, t1);
- _mm_storel_epi64((__m128i *)(pred), t0);
- pred += 4;
+ _mm_storel_epi64((__m128i *)(comp_pred), t0);
+ comp_pred += 4;
ref += 4 * 8;
}
ref += stride - (width << 3);
@@ -617,10 +616,10 @@
void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
const uint8_t *pred8, int width,
int height, const uint8_t *ref8,
- const int ref_stride) {
+ int ref_stride) {
const __m128i one = _mm_set1_epi16(1);
- const int stride = ref_stride << 3;
int i, j;
+ int stride = ref_stride << 3;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
diff --git a/aom_dsp/x86/highbd_variance_sse4.c b/aom_dsp/x86/highbd_variance_sse4.c
new file mode 100644
index 0000000..298cbf7
--- /dev/null
+++ b/aom_dsp/x86/highbd_variance_sse4.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_dsp/variance.h"
+#include "aom_dsp/aom_filter.h"
+
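+// Computes the sum and sum of squares of the 4x4 difference block: the 16-bit
+// differences are reduced with _mm_madd_epi16 (against a vector of ones for
+// the sum, against themselves for the SSE) followed by horizontal adds.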
+static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ uint64_t *sse, int64_t *sum) {
+ __m128i u0, u1, u2, u3;
+ __m128i s0, s1, s2, s3;
+ __m128i t0, t1, x0, y0;
+ __m128i a0, a1, a2, a3;
+ __m128i b0, b1, b2, b3;
+ __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+ a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
+ a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
+ a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
+ a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));
+
+ b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
+ b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
+ b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
+ b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));
+
+ u0 = _mm_unpacklo_epi16(a0, a1);
+ u1 = _mm_unpacklo_epi16(a2, a3);
+ u2 = _mm_unpacklo_epi16(b0, b1);
+ u3 = _mm_unpacklo_epi16(b2, b3);
+
+ s0 = _mm_sub_epi16(u0, u2);
+ s1 = _mm_sub_epi16(u1, u3);
+
+ t0 = _mm_madd_epi16(s0, k_one_epi16);
+ t1 = _mm_madd_epi16(s1, k_one_epi16);
+
+ s2 = _mm_hadd_epi32(t0, t1);
+ s3 = _mm_hadd_epi32(s2, s2);
+ y0 = _mm_hadd_epi32(s3, s3);
+
+ t0 = _mm_madd_epi16(s0, s0);
+ t1 = _mm_madd_epi16(s1, s1);
+
+ s2 = _mm_hadd_epi32(t0, t1);
+ s3 = _mm_hadd_epi32(s2, s2);
+ x0 = _mm_hadd_epi32(s3, s3);
+
+ *sse = (uint64_t)_mm_extract_epi32(x0, 0);
+ *sum = (int64_t)_mm_extract_epi32(y0, 0);
+}
+
+uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int64_t sum, diff;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)local_sse;
+
+ diff = (int64_t)*sse - ((sum * sum) >> 4);
+ return (diff >= 0) ? (uint32_t)diff : 0;
+}
+
+uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int64_t sum, diff;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
+ sum = ROUND_POWER_OF_TWO(sum, 2);
+
+ diff = (int64_t)*sse - ((sum * sum) >> 4);
+ return (diff >= 0) ? (uint32_t)diff : 0;
+}
+
+uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int64_t sum, diff;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
+ sum = ROUND_POWER_OF_TWO(sum, 4);
+
+ diff = (int64_t)*sse - ((sum * sum) >> 4);
+ return diff >= 0 ? (uint32_t)diff : 0;
+}
+
+// Sub-pixel
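+// Each sub-pixel variant runs the two-tap bilinear filter horizontally and
+// then vertically into temp2 before measuring the variance of the filtered
+// block against dst.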
+uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride,
+ sse);
+}
+
+uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
+ dst_stride, sse);
+}
+
+uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
+ dst_stride, sse);
+}
+
+// Sub-pixel average
+
+uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
+ 4);
+
+ return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
+ sse);
+}
+
+uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
+ 4);
+
+ return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
+ dst_stride, sse);
+}
+
+uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
+ 4);
+
+ return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
+ dst_stride, sse);
+}
diff --git a/aom_dsp/x86/inv_txfm_sse2.c b/aom_dsp/x86/inv_txfm_sse2.c
index 7d1fc52..2217a46 100644
--- a/aom_dsp/x86/inv_txfm_sse2.c
+++ b/aom_dsp/x86/inv_txfm_sse2.c
@@ -159,8 +159,8 @@
const __m128i zero = _mm_setzero_si128();
int a;
- a = dct_const_round_shift(input[0] * cospi_16_64);
- a = dct_const_round_shift(a * cospi_16_64);
+ a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+ a = (int)dct_const_round_shift(a * cospi_16_64);
a = ROUND_POWER_OF_TWO(a, 4);
dc_value = _mm_set1_epi16(a);
@@ -171,14 +171,6 @@
RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
}
-static INLINE void transpose_4x4(__m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
- const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
- res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
- res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
-}
-
void aom_idct4_sse2(__m128i *in) {
const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
@@ -187,7 +179,7 @@
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8];
- transpose_4x4(in);
+ array_transpose_4x4(in);
// stage 1
u[0] = _mm_unpacklo_epi16(in[0], in[1]);
u[1] = _mm_unpackhi_epi16(in[0], in[1]);
@@ -225,7 +217,7 @@
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8], in7;
- transpose_4x4(in);
+ array_transpose_4x4(in);
in7 = _mm_srli_si128(in[1], 8);
in7 = _mm_add_epi16(in7, in[0]);
in7 = _mm_sub_epi16(in7, in[1]);
@@ -525,8 +517,8 @@
const __m128i zero = _mm_setzero_si128();
int a;
- a = dct_const_round_shift(input[0] * cospi_16_64);
- a = dct_const_round_shift(a * cospi_16_64);
+ a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+ a = (int)dct_const_round_shift(a * cospi_16_64);
a = ROUND_POWER_OF_TWO(a, 5);
dc_value = _mm_set1_epi16(a);
@@ -1295,34 +1287,20 @@
const __m128i zero = _mm_setzero_si128();
int a, i;
- a = dct_const_round_shift(input[0] * cospi_16_64);
- a = dct_const_round_shift(a * cospi_16_64);
+ a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+ a = (int)dct_const_round_shift(a * cospi_16_64);
a = ROUND_POWER_OF_TWO(a, 6);
dc_value = _mm_set1_epi16(a);
- for (i = 0; i < 2; ++i) {
- RECON_AND_STORE(dest + 0 * stride, dc_value);
- RECON_AND_STORE(dest + 1 * stride, dc_value);
- RECON_AND_STORE(dest + 2 * stride, dc_value);
- RECON_AND_STORE(dest + 3 * stride, dc_value);
- RECON_AND_STORE(dest + 4 * stride, dc_value);
- RECON_AND_STORE(dest + 5 * stride, dc_value);
- RECON_AND_STORE(dest + 6 * stride, dc_value);
- RECON_AND_STORE(dest + 7 * stride, dc_value);
- RECON_AND_STORE(dest + 8 * stride, dc_value);
- RECON_AND_STORE(dest + 9 * stride, dc_value);
- RECON_AND_STORE(dest + 10 * stride, dc_value);
- RECON_AND_STORE(dest + 11 * stride, dc_value);
- RECON_AND_STORE(dest + 12 * stride, dc_value);
- RECON_AND_STORE(dest + 13 * stride, dc_value);
- RECON_AND_STORE(dest + 14 * stride, dc_value);
- RECON_AND_STORE(dest + 15 * stride, dc_value);
- dest += 8;
+ for (i = 0; i < 16; ++i) {
+ RECON_AND_STORE(dest + 0, dc_value);
+ RECON_AND_STORE(dest + 8, dc_value);
+ dest += stride;
}
}
-static void iadst16_8col(__m128i *in) {
+void iadst16_8col(__m128i *in) {
// perform 16x16 1-D ADST for 8 columns
__m128i s[16], x[16], u[32], v[32];
const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
@@ -1792,7 +1770,7 @@
in[15] = _mm_sub_epi16(kZero, s[1]);
}
-static void idct16_8col(__m128i *in) {
+void idct16_8col(__m128i *in) {
const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
@@ -2687,28 +2665,28 @@
stp1_31 = stp2_31; \
}
-#define IDCT32 \
+#define IDCT32(in0, in1) \
/* Stage1 */ \
{ \
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
- const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
- const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
+ const __m128i lo_1_31 = _mm_unpacklo_epi16((in0)[1], (in1)[15]); \
+ const __m128i hi_1_31 = _mm_unpackhi_epi16((in0)[1], (in1)[15]); \
+ const __m128i lo_17_15 = _mm_unpacklo_epi16((in1)[1], (in0)[15]); \
+ const __m128i hi_17_15 = _mm_unpackhi_epi16((in1)[1], (in0)[15]); \
\
- const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
- const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
- const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
- const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
+ const __m128i lo_9_23 = _mm_unpacklo_epi16((in0)[9], (in1)[7]); \
+ const __m128i hi_9_23 = _mm_unpackhi_epi16((in0)[9], (in1)[7]); \
+ const __m128i lo_25_7 = _mm_unpacklo_epi16((in1)[9], (in0)[7]); \
+ const __m128i hi_25_7 = _mm_unpackhi_epi16((in1)[9], (in0)[7]); \
\
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
- const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
- const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
+ const __m128i lo_5_27 = _mm_unpacklo_epi16((in0)[5], (in1)[11]); \
+ const __m128i hi_5_27 = _mm_unpackhi_epi16((in0)[5], (in1)[11]); \
+ const __m128i lo_21_11 = _mm_unpacklo_epi16((in1)[5], (in0)[11]); \
+ const __m128i hi_21_11 = _mm_unpackhi_epi16((in1)[5], (in0)[11]); \
\
- const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
- const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
- const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
- const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
+ const __m128i lo_13_19 = _mm_unpacklo_epi16((in0)[13], (in1)[3]); \
+ const __m128i hi_13_19 = _mm_unpackhi_epi16((in0)[13], (in1)[3]); \
+ const __m128i lo_29_3 = _mm_unpacklo_epi16((in1)[13], (in0)[3]); \
+ const __m128i hi_29_3 = _mm_unpackhi_epi16((in1)[13], (in0)[3]); \
\
MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \
@@ -2725,15 +2703,15 @@
\
/* Stage2 */ \
{ \
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
- const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
- const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
+ const __m128i lo_2_30 = _mm_unpacklo_epi16((in0)[2], (in1)[14]); \
+ const __m128i hi_2_30 = _mm_unpackhi_epi16((in0)[2], (in1)[14]); \
+ const __m128i lo_18_14 = _mm_unpacklo_epi16((in1)[2], (in0)[14]); \
+ const __m128i hi_18_14 = _mm_unpackhi_epi16((in1)[2], (in0)[14]); \
\
- const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
- const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
- const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
- const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
+ const __m128i lo_10_22 = _mm_unpacklo_epi16((in0)[10], (in1)[6]); \
+ const __m128i hi_10_22 = _mm_unpackhi_epi16((in0)[10], (in1)[6]); \
+ const __m128i lo_26_6 = _mm_unpacklo_epi16((in1)[10], (in0)[6]); \
+ const __m128i hi_26_6 = _mm_unpackhi_epi16((in1)[10], (in0)[6]); \
\
MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
@@ -2765,10 +2743,10 @@
\
/* Stage3 */ \
{ \
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
- const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
- const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
+ const __m128i lo_4_28 = _mm_unpacklo_epi16((in0)[4], (in1)[12]); \
+ const __m128i hi_4_28 = _mm_unpackhi_epi16((in0)[4], (in1)[12]); \
+ const __m128i lo_20_12 = _mm_unpacklo_epi16((in1)[4], (in0)[12]); \
+ const __m128i hi_20_12 = _mm_unpackhi_epi16((in1)[4], (in0)[12]); \
\
const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
@@ -2812,10 +2790,10 @@
\
/* Stage4 */ \
{ \
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
- const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
- const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
+ const __m128i lo_0_16 = _mm_unpacklo_epi16((in0)[0], (in1)[0]); \
+ const __m128i hi_0_16 = _mm_unpackhi_epi16((in0)[0], (in1)[0]); \
+ const __m128i lo_8_24 = _mm_unpacklo_epi16((in0)[8], (in1)[8]); \
+ const __m128i hi_8_24 = _mm_unpackhi_epi16((in0)[8], (in1)[8]); \
\
const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
@@ -3356,7 +3334,7 @@
array_transpose_8x8(in + 16, in + 16);
array_transpose_8x8(in + 24, in + 24);
- IDCT32
+ IDCT32(in, in + 16)
// 1_D: Store 32 intermediate results for each 8x32 block.
col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3402,7 +3380,7 @@
array_transpose_8x8(col + j + 64, in + 16);
array_transpose_8x8(col + j + 96, in + 24);
- IDCT32
+ IDCT32(in, in + 16)
// 2_D: Calculate the results and store them to destination.
in[0] = _mm_add_epi16(stp1_0, stp1_31);
@@ -3455,8 +3433,8 @@
const __m128i zero = _mm_setzero_si128();
int a, j;
- a = dct_const_round_shift(input[0] * cospi_16_64);
- a = dct_const_round_shift(a * cospi_16_64);
+ a = (int)dct_const_round_shift(input[0] * cospi_16_64);
+ a = (int)dct_const_round_shift(a * cospi_16_64);
a = ROUND_POWER_OF_TWO(a, 6);
dc_value = _mm_set1_epi16(a);
@@ -3469,6 +3447,107 @@
}
}
+// Apply a 32-element IDCT to 8 columns. This does not do any transposition
+// of its input - the caller is expected to have done that.
+// The input buffers are the top and bottom halves of an 8x32 block.
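+// On return, in0 holds outputs 0-15 and in1 holds outputs 16-31 of each column.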
+void idct32_8col(__m128i *in0, __m128i *in1) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ // idct constants for each stage
+ const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
+ stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
+ stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ IDCT32(in0, in1)
+
+ // Final butterfly: combine the stage outputs and write them back into in0/in1.
+ in0[0] = _mm_add_epi16(stp1_0, stp1_31);
+ in0[1] = _mm_add_epi16(stp1_1, stp1_30);
+ in0[2] = _mm_add_epi16(stp1_2, stp1_29);
+ in0[3] = _mm_add_epi16(stp1_3, stp1_28);
+ in0[4] = _mm_add_epi16(stp1_4, stp1_27);
+ in0[5] = _mm_add_epi16(stp1_5, stp1_26);
+ in0[6] = _mm_add_epi16(stp1_6, stp1_25);
+ in0[7] = _mm_add_epi16(stp1_7, stp1_24);
+ in0[8] = _mm_add_epi16(stp1_8, stp1_23);
+ in0[9] = _mm_add_epi16(stp1_9, stp1_22);
+ in0[10] = _mm_add_epi16(stp1_10, stp1_21);
+ in0[11] = _mm_add_epi16(stp1_11, stp1_20);
+ in0[12] = _mm_add_epi16(stp1_12, stp1_19);
+ in0[13] = _mm_add_epi16(stp1_13, stp1_18);
+ in0[14] = _mm_add_epi16(stp1_14, stp1_17);
+ in0[15] = _mm_add_epi16(stp1_15, stp1_16);
+ in1[0] = _mm_sub_epi16(stp1_15, stp1_16);
+ in1[1] = _mm_sub_epi16(stp1_14, stp1_17);
+ in1[2] = _mm_sub_epi16(stp1_13, stp1_18);
+ in1[3] = _mm_sub_epi16(stp1_12, stp1_19);
+ in1[4] = _mm_sub_epi16(stp1_11, stp1_20);
+ in1[5] = _mm_sub_epi16(stp1_10, stp1_21);
+ in1[6] = _mm_sub_epi16(stp1_9, stp1_22);
+ in1[7] = _mm_sub_epi16(stp1_8, stp1_23);
+ in1[8] = _mm_sub_epi16(stp1_7, stp1_24);
+ in1[9] = _mm_sub_epi16(stp1_6, stp1_25);
+ in1[10] = _mm_sub_epi16(stp1_5, stp1_26);
+ in1[11] = _mm_sub_epi16(stp1_4, stp1_27);
+ in1[12] = _mm_sub_epi16(stp1_3, stp1_28);
+ in1[13] = _mm_sub_epi16(stp1_2, stp1_29);
+ in1[14] = _mm_sub_epi16(stp1_1, stp1_30);
+ in1[15] = _mm_sub_epi16(stp1_0, stp1_31);
+}
+
#if CONFIG_AOM_HIGHBITDEPTH
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
__m128i ubounded, retval;
@@ -3528,7 +3607,7 @@
test = _mm_movemask_epi8(temp_mm);
if (test) {
- transpose_4x4(inptr);
+ array_transpose_4x4(inptr);
sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
diff --git a/aom_dsp/x86/inv_txfm_sse2.h b/aom_dsp/x86/inv_txfm_sse2.h
index 56d83ec..4ebb34d 100644
--- a/aom_dsp/x86/inv_txfm_sse2.h
+++ b/aom_dsp/x86/inv_txfm_sse2.h
@@ -19,6 +19,14 @@
#include "aom_dsp/x86/txfm_common_sse2.h"
// perform 8x8 transpose
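+// In-place transpose of a 4x4 block of 16-bit coefficients, packed two rows
+// per register (rows 0-1 in res[0], rows 2-3 in res[1]).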
+static INLINE void array_transpose_4x4(__m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+ const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+ res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
+ res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
+}
+
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
@@ -187,11 +195,14 @@
RECON_AND_STORE(dest + 15 * stride, in[15]);
}
+void iadst16_8col(__m128i *in);
+void idct16_8col(__m128i *in);
void aom_idct4_sse2(__m128i *in);
void aom_idct8_sse2(__m128i *in);
void aom_idct16_sse2(__m128i *in0, __m128i *in1);
void aom_iadst4_sse2(__m128i *in);
void aom_iadst8_sse2(__m128i *in);
void aom_iadst16_sse2(__m128i *in0, __m128i *in1);
+void idct32_8col(__m128i *in0, __m128i *in1);
#endif // AOM_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/aom_dsp/x86/loopfilter_avx2.c b/aom_dsp/x86/loopfilter_avx2.c
index 1098e17..bf8150e 100644
--- a/aom_dsp/x86/loopfilter_avx2.c
+++ b/aom_dsp/x86/loopfilter_avx2.c
@@ -911,4 +911,5 @@
q6 = _mm_or_si128(flat2_q6, q6);
_mm_storeu_si128((__m128i *)(s + 6 * p), q6);
}
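+ // Clear the upper halves of the ymm registers before returning to SSE code
+ // to avoid AVX-SSE transition penalties.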
+ _mm256_zeroupper();
}
diff --git a/aom_dsp/x86/masked_sad_intrin_ssse3.c b/aom_dsp/x86/masked_sad_intrin_ssse3.c
new file mode 100644
index 0000000..44d5011
--- /dev/null
+++ b/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#include "aom_ports/mem.h"
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+
+static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) {
+ __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
+ __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
+ return _mm_unpacklo_epi64(temp1, temp2);
+}
+
+static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) {
+ __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t *)ptr);
+ __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride));
+ __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2);
+ temp1 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 2));
+ temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 3));
+ temp1 = _mm_unpacklo_epi32(temp1, temp2);
+ return _mm_unpacklo_epi64(temp3, temp1);
+}
+
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height);
+
+static INLINE unsigned int masked_sad8xh_ssse3(
+ const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height);
+
+static INLINE unsigned int masked_sad4xh_ssse3(
+ const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height);
+
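+// The masked SAD is sum(m[i] * |src[i] - ref[i]|) over the block, with mask
+// values in [0, 64]; the total is rounded and normalised by 64.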
+#define MASKSADMXN_SSSE3(m, n) \
+ unsigned int aom_masked_sad##m##x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *msk, int msk_stride) { \
+ return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \
+ m, n); \
+ }
+
+#if CONFIG_EXT_PARTITION
+MASKSADMXN_SSSE3(128, 128)
+MASKSADMXN_SSSE3(128, 64)
+MASKSADMXN_SSSE3(64, 128)
+#endif // CONFIG_EXT_PARTITION
+MASKSADMXN_SSSE3(64, 64)
+MASKSADMXN_SSSE3(64, 32)
+MASKSADMXN_SSSE3(32, 64)
+MASKSADMXN_SSSE3(32, 32)
+MASKSADMXN_SSSE3(32, 16)
+MASKSADMXN_SSSE3(16, 32)
+MASKSADMXN_SSSE3(16, 16)
+MASKSADMXN_SSSE3(16, 8)
+
+#define MASKSAD8XN_SSSE3(n) \
+ unsigned int aom_masked_sad8x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *msk, int msk_stride) { \
+ return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk, \
+ msk_stride, n); \
+ }
+
+MASKSAD8XN_SSSE3(16)
+MASKSAD8XN_SSSE3(8)
+MASKSAD8XN_SSSE3(4)
+
+#define MASKSAD4XN_SSSE3(n) \
+ unsigned int aom_masked_sad4x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *msk, int msk_stride) { \
+ return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
+ msk_stride, n); \
+ }
+
+MASKSAD4XN_SSSE3(8)
+MASKSAD4XN_SSSE3(4)
+
+// For width a multiple of 16
+// Assumes values in m are <=64
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ int y, x;
+ __m128i a, b, m, temp1, temp2;
+ __m128i res = _mm_setzero_si128();
+ __m128i one = _mm_set1_epi16(1);
+ // For each row
+ for (y = 0; y < height; y++) {
+ // Covering the full width
+ for (x = 0; x < width; x += 16) {
+ // Load a, b, m in xmm registers
+ a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
+ b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
+ m = _mm_loadu_si128((const __m128i *)(m_ptr + x));
+
+ // Calculate the difference between a & b
+ temp1 = _mm_subs_epu8(a, b);
+ temp2 = _mm_subs_epu8(b, a);
+ temp1 = _mm_or_si128(temp1, temp2);
+
+ // Multiply by m and add together
+ temp2 = _mm_maddubs_epi16(temp1, m);
+ // Pad out row result to 32 bit integers & add to running total
+ res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one));
+ }
+ // Move onto the next row
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ // sad = (sad + 31) >> 6;
+ return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+
+static INLINE unsigned int masked_sad8xh_ssse3(
+ const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height) {
+ int y;
+ __m128i a, b, m, temp1, temp2, row_res;
+ __m128i res = _mm_setzero_si128();
+ __m128i one = _mm_set1_epi16(1);
+ // Add the masked SAD for 2 rows at a time
+ for (y = 0; y < height; y += 2) {
+ // Load a, b, m in xmm registers
+ a = width8_load_2rows(a_ptr, a_stride);
+ b = width8_load_2rows(b_ptr, b_stride);
+ m = width8_load_2rows(m_ptr, m_stride);
+
+ // Calculate the difference between a & b
+ temp1 = _mm_subs_epu8(a, b);
+ temp2 = _mm_subs_epu8(b, a);
+ temp1 = _mm_or_si128(temp1, temp2);
+
+ // Multiply by m and add together
+ row_res = _mm_maddubs_epi16(temp1, m);
+
+ // Pad out row result to 32 bit integers & add to running total
+ res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
+
+ // Move onto the next rows
+ a_ptr += a_stride * 2;
+ b_ptr += b_stride * 2;
+ m_ptr += m_stride * 2;
+ }
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ // sad = (sad + 31) >> 6;
+ return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+
+static INLINE unsigned int masked_sad4xh_ssse3(
+ const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height) {
+ int y;
+ __m128i a, b, m, temp1, temp2, row_res;
+ __m128i res = _mm_setzero_si128();
+ __m128i one = _mm_set1_epi16(1);
+ // Add the masked SAD for 4 rows at a time
+ for (y = 0; y < height; y += 4) {
+ // Load a, b, m in xmm registers
+ a = width4_load_4rows(a_ptr, a_stride);
+ b = width4_load_4rows(b_ptr, b_stride);
+ m = width4_load_4rows(m_ptr, m_stride);
+
+ // Calculate the difference between a & b
+ temp1 = _mm_subs_epu8(a, b);
+ temp2 = _mm_subs_epu8(b, a);
+ temp1 = _mm_or_si128(temp1, temp2);
+
+ // Multiply by m and add together
+ row_res = _mm_maddubs_epi16(temp1, m);
+
+ // Pad out row result to 32 bit integers & add to running total
+ res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
+
+ // Move onto the next rows
+ a_ptr += a_stride * 4;
+ b_ptr += b_stride * 4;
+ m_ptr += m_stride * 4;
+ }
+ // Horizontally add the partial sums
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ // sad = (sad + 31) >> 6;
+ return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr,
+ int stride) {
+ __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr);
+ __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride));
+ return _mm_unpacklo_epi64(temp1, temp2);
+}
+
+static INLINE unsigned int highbd_masked_sad_ssse3(
+ const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int width, int height);
+
+static INLINE unsigned int highbd_masked_sad4xh_ssse3(
+ const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height);
+
+#define HIGHBD_MASKSADMXN_SSSE3(m, n) \
+ unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *msk, int msk_stride) { \
+ return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, \
+ msk_stride, m, n); \
+ }
+
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN_SSSE3(128, 128)
+HIGHBD_MASKSADMXN_SSSE3(128, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 128)
+#endif // CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN_SSSE3(64, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 64)
+HIGHBD_MASKSADMXN_SSSE3(32, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 32)
+HIGHBD_MASKSADMXN_SSSE3(16, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 16)
+HIGHBD_MASKSADMXN_SSSE3(8, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 4)
+
+#define HIGHBD_MASKSAD4XN_SSSE3(n) \
+ unsigned int aom_highbd_masked_sad4x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *msk, int msk_stride) { \
+ return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
+ msk_stride, n); \
+ }
+
+HIGHBD_MASKSAD4XN_SSSE3(8)
+HIGHBD_MASKSAD4XN_SSSE3(4)
+
+// For width a multiple of 8
+// Assumes values in m are <=64
+static INLINE unsigned int highbd_masked_sad_ssse3(
+ const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int width, int height) {
+ int y, x;
+ __m128i a, b, m, temp1, temp2;
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
+ __m128i res = _mm_setzero_si128();
+ // For each row
+ for (y = 0; y < height; y++) {
+ // Covering the full width
+ for (x = 0; x < width; x += 8) {
+ // Load a, b, m in xmm registers
+ a = _mm_loadu_si128((const __m128i *)(a_ptr + x));
+ b = _mm_loadu_si128((const __m128i *)(b_ptr + x));
+ m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)),
+ _mm_setzero_si128());
+
+ // Calculate the difference between a & b
+ temp1 = _mm_subs_epu16(a, b);
+ temp2 = _mm_subs_epu16(b, a);
+ temp1 = _mm_or_si128(temp1, temp2);
+
+ // Multiply by m, add adjacent pairs, and accumulate into the running total
+ res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
+ }
+ // Move onto the next row
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ // sad = (sad + 31) >> 6;
+ return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+
+static INLINE unsigned int highbd_masked_sad4xh_ssse3(
+ const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height) {
+ int y;
+ __m128i a, b, m, temp1, temp2;
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
+ __m128i res = _mm_setzero_si128();
+ // Add the masked SAD for 2 rows at a time
+ for (y = 0; y < height; y += 2) {
+ // Load a, b, m in xmm registers
+ a = highbd_width4_load_2rows(a_ptr, a_stride);
+ b = highbd_width4_load_2rows(b_ptr, b_stride);
+ temp1 = _mm_loadl_epi64((const __m128i *)m_ptr);
+ temp2 = _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride));
+ m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2),
+ _mm_setzero_si128());
+
+ // Calculate the difference between a & b
+ temp1 = _mm_subs_epu16(a, b);
+ temp2 = _mm_subs_epu16(b, a);
+ temp1 = _mm_or_si128(temp1, temp2);
+
+ // Multiply by m and add together
+ res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
+
+ // Move onto the next rows
+ a_ptr += a_stride * 2;
+ b_ptr += b_stride * 2;
+ m_ptr += m_stride * 2;
+ }
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ res = _mm_hadd_epi32(res, _mm_setzero_si128());
+ // sad = (sad + 31) >> 6;
+ return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/aom_dsp/x86/masked_variance_intrin_ssse3.c b/aom_dsp/x86/masked_variance_intrin_ssse3.c
new file mode 100644
index 0000000..2a838a6
--- /dev/null
+++ b/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -0,0 +1,1945 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#include "./aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_filter.h"
+
+// Half pixel shift
+#define HALF_PIXEL_OFFSET (BIL_SUBPEL_SHIFTS / 2)
+
+/*****************************************************************************
+ * Horizontal additions
+ *****************************************************************************/
+
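+// Sum the four 32-bit lanes into a single 32-bit result.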
+static INLINE int32_t hsum_epi32_si32(__m128i v_d) {
+ v_d = _mm_hadd_epi32(v_d, v_d);
+ v_d = _mm_hadd_epi32(v_d, v_d);
+ return _mm_cvtsi128_si32(v_d);
+}
+
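+// Sum the two 64-bit lanes into a single 64-bit result.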
+static INLINE int64_t hsum_epi64_si64(__m128i v_q) {
+ v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
+#if ARCH_X86_64
+ return _mm_cvtsi128_si64(v_q);
+#else
+ {
+ int64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, v_q);
+ return tmp;
+ }
+#endif
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
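+// Sign-extend the four 32-bit lanes to 64 bits and sum them.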
+static INLINE int64_t hsum_epi32_si64(__m128i v_d) {
+ const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());
+ const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
+ const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
+ return hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
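+// Combine the accumulated mask-weighted sum and SSE into the final variance.
+// Mask values are in [0, 64], so the sum is normalised by 64 and the SSE by
+// 64 * 64 before applying variance = sse - sum^2 / (w * h).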
+static INLINE uint32_t calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q,
+ uint32_t *sse, const int w,
+ const int h) {
+ int64_t sum64;
+ uint64_t sse64;
+
+ // Horizontal sum
+ sum64 = hsum_epi32_si32(v_sum_d);
+ sse64 = hsum_epi64_si64(v_sse_q);
+
+ sum64 = (sum64 >= 0) ? sum64 : -sum64;
+
+ // Round
+ sum64 = ROUND_POWER_OF_TWO(sum64, 6);
+ sse64 = ROUND_POWER_OF_TWO(sse64, 12);
+
+ // Store the SSE
+ *sse = (uint32_t)sse64;
+ // Compute the variance
+ return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
+
+/*****************************************************************************
+ * n*16 Wide versions
+ *****************************************************************************/
+
+static INLINE unsigned int masked_variancewxh_ssse3(
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) {
+ int ii, jj;
+
+ const __m128i v_zero = _mm_setzero_si128();
+
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+
+ assert((w % 16) == 0);
+
+ for (ii = 0; ii < h; ii++) {
+ for (jj = 0; jj < w; jj += 16) {
+ // Load inputs - 8 bits
+ const __m128i v_a_b = _mm_loadu_si128((const __m128i *)(a + jj));
+ const __m128i v_b_b = _mm_loadu_si128((const __m128i *)(b + jj));
+ const __m128i v_m_b = _mm_loadu_si128((const __m128i *)(m + jj));
+
+ // Unpack to 16 bits - still containing max 8 bits
+ const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero);
+ const __m128i v_b0_w = _mm_unpacklo_epi8(v_b_b, v_zero);
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+ const __m128i v_a1_w = _mm_unpackhi_epi8(v_a_b, v_zero);
+ const __m128i v_b1_w = _mm_unpackhi_epi8(v_b_b, v_zero);
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m_b, v_zero);
+
+ // Difference: [-255, 255]
+ const __m128i v_d0_w = _mm_sub_epi16(v_a0_w, v_b0_w);
+ const __m128i v_d1_w = _mm_sub_epi16(v_a1_w, v_b1_w);
+
+ // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits
+ const __m128i v_e0_w = _mm_mullo_epi16(v_d0_w, v_m0_w);
+ const __m128i v_e0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
+ const __m128i v_e1_w = _mm_mullo_epi16(v_d1_w, v_m1_w);
+ const __m128i v_e1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
+
+ // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
+ const __m128i v_se0_d = _mm_madd_epi16(v_e0_w, v_e0_w);
+ const __m128i v_se1_d = _mm_madd_epi16(v_e1_w, v_e1_w);
+
+ // Sum of v_se{0,1}_d - 31 bits + 31 bits = 32 bits
+ const __m128i v_se_d = _mm_add_epi32(v_se0_d, v_se1_d);
+
+ // Unpack Squared error to 64 bits
+ const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
+ const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
+
+ // Accumulate
+ v_sum_d = _mm_add_epi32(v_sum_d, v_e0_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_e1_d);
+ v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q);
+ v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q);
+ }
+
+ // Move on to next row
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
+}
+
+#define MASKED_VARWXH(W, H) \
+ unsigned int aom_masked_variance##W##x##H##_ssse3( \
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ return masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, m_stride, W, \
+ H, sse); \
+ }
+
+MASKED_VARWXH(16, 8)
+MASKED_VARWXH(16, 16)
+MASKED_VARWXH(16, 32)
+MASKED_VARWXH(32, 16)
+MASKED_VARWXH(32, 32)
+MASKED_VARWXH(32, 64)
+MASKED_VARWXH(64, 32)
+MASKED_VARWXH(64, 64)
+#if CONFIG_EXT_PARTITION
+MASKED_VARWXH(64, 128)
+MASKED_VARWXH(128, 64)
+MASKED_VARWXH(128, 128)
+#endif // CONFIG_EXT_PARTITION
+
+/*****************************************************************************
+ * 8 Wide versions
+ *****************************************************************************/
+
+static INLINE unsigned int masked_variance8xh_ssse3(
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int h, unsigned int *sse) {
+ int ii;
+
+ const __m128i v_zero = _mm_setzero_si128();
+
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+
+ for (ii = 0; ii < h; ii++) {
+ // Load inputs - 8 bits
+ const __m128i v_a_b = _mm_loadl_epi64((const __m128i *)a);
+ const __m128i v_b_b = _mm_loadl_epi64((const __m128i *)b);
+ const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)m);
+
+ // Unpack to 16 bits - still containing max 8 bits
+ const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero);
+ const __m128i v_b_w = _mm_unpacklo_epi8(v_b_b, v_zero);
+ const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+
+ // Difference: [-255, 255]
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+
+ // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits
+ const __m128i v_e_w = _mm_mullo_epi16(v_d_w, v_m_w);
+ const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
+
+ // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
+ const __m128i v_se_d = _mm_madd_epi16(v_e_w, v_e_w);
+
+ // Unpack Squared error to 64 bits
+ const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
+ const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
+
+ // Accumulate
+ v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
+ v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q);
+ v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q);
+
+ // Move on to next row
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
+}
+
+#define MASKED_VAR8XH(H) \
+ unsigned int aom_masked_variance8x##H##_ssse3( \
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ return masked_variance8xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \
+ sse); \
+ }
+
+MASKED_VAR8XH(4)
+MASKED_VAR8XH(8)
+MASKED_VAR8XH(16)
+
+/*****************************************************************************
+ * 4 Wide versions
+ *****************************************************************************/
+
+static INLINE unsigned int masked_variance4xh_ssse3(
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int h, unsigned int *sse) {
+ int ii;
+
+ const __m128i v_zero = _mm_setzero_si128();
+
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+
+ assert((h % 2) == 0);
+
+ for (ii = 0; ii < h / 2; ii++) {
+ // Load 2 input rows - 8 bits
+ const __m128i v_a0_b = _mm_cvtsi32_si128(*(const uint32_t *)a);
+ const __m128i v_b0_b = _mm_cvtsi32_si128(*(const uint32_t *)b);
+ const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m);
+ const __m128i v_a1_b = _mm_cvtsi32_si128(*(const uint32_t *)(a + a_stride));
+ const __m128i v_b1_b = _mm_cvtsi32_si128(*(const uint32_t *)(b + b_stride));
+ const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride));
+
+ // Interleave 2 rows into a single register
+ const __m128i v_a_b = _mm_unpacklo_epi32(v_a0_b, v_a1_b);
+ const __m128i v_b_b = _mm_unpacklo_epi32(v_b0_b, v_b1_b);
+ const __m128i v_m_b = _mm_unpacklo_epi32(v_m0_b, v_m1_b);
+
+ // Unpack to 16 bits - still containing max 8 bits
+ const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero);
+ const __m128i v_b_w = _mm_unpacklo_epi8(v_b_b, v_zero);
+ const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+
+ // Difference: [-255, 255]
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+
+ // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits
+ const __m128i v_e_w = _mm_mullo_epi16(v_d_w, v_m_w);
+ const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
+
+ // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
+ const __m128i v_se_d = _mm_madd_epi16(v_e_w, v_e_w);
+
+ // Unpack Squared error to 64 bits
+ const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
+ const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
+
+ // Accumulate
+ v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
+ v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q);
+ v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q);
+
+ // Move on to the next 2 rows
+ a += a_stride * 2;
+ b += b_stride * 2;
+ m += m_stride * 2;
+ }
+
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+#define MASKED_VAR4XH(H) \
+ unsigned int aom_masked_variance4x##H##_ssse3( \
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ return masked_variance4xh_ssse3(a, a_stride, b, b_stride, m, m_stride, H, \
+ sse); \
+ }
+
+MASKED_VAR4XH(4)
+MASKED_VAR4XH(8)
+
+#if CONFIG_AOM_HIGHBITDEPTH
+
+// Main calculation for n*8 wide blocks
+static INLINE void highbd_masked_variance64_ssse3(
+ const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int w, int h, int64_t *sum, uint64_t *sse) {
+ int ii, jj;
+
+ const __m128i v_zero = _mm_setzero_si128();
+
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+
+ assert((w % 8) == 0);
+
+ for (ii = 0; ii < h; ii++) {
+ for (jj = 0; jj < w; jj += 8) {
+ // Load inputs - a and b are 16 bits per pixel, m is 8 bits
+ const __m128i v_a_w = _mm_loadu_si128((const __m128i *)(a + jj));
+ const __m128i v_b_w = _mm_loadu_si128((const __m128i *)(b + jj));
+ const __m128i v_m_b = _mm_loadl_epi64((const __m128i *)(m + jj));
+
+ // Unpack m to 16 bits - still containing max 8 bits
+ const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+
+ // Difference: [-4095, 4095]
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+
+ // Error - [-4095, 4095] * [0, 64] => sum of 2 of these fits in 19 bits
+ const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
+
+ // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit)
+ const __m128i v_absd_w = _mm_abs_epi16(v_d_w);
+ const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero);
+ const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero);
+ const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d);
+ const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero);
+ const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero);
+ const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d);
+ // Square and sum the errors -> 36bits * 4 = 38bits
+ __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d;
+ v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d);
+ v_elo1_d = _mm_srli_si128(v_elo_d, 4);
+ v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d);
+ v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q);
+ v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d);
+ v_ehi3_d = _mm_srli_si128(v_ehi_d, 4);
+ v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d);
+ v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q);
+ v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
+
+ // Accumulate
+ v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
+ v_sse_q = _mm_add_epi64(v_sse_q, v_se_q);
+ }
+
+ // Move on to next row
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ // Horizontal sum
+ *sum = hsum_epi32_si64(v_sum_d);
+ *sse = hsum_epi64_si64(v_sse_q);
+
+ // Round
+ *sum = (*sum >= 0) ? *sum : -*sum;
+ *sum = ROUND_POWER_OF_TWO(*sum, 6);
+ *sse = ROUND_POWER_OF_TWO(*sse, 12);
+}
+
+// Main calculation for 4 wide blocks
+static INLINE void highbd_masked_variance64_4wide_ssse3(
+ const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int h, int64_t *sum, uint64_t *sse) {
+ int ii;
+
+ const __m128i v_zero = _mm_setzero_si128();
+
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+
+ assert((h % 2) == 0);
+
+ for (ii = 0; ii < h / 2; ii++) {
+ // Load 2 input rows - a and b are 16 bits per pixel, m is 8 bits
+ const __m128i v_a0_w = _mm_loadl_epi64((const __m128i *)a);
+ const __m128i v_b0_w = _mm_loadl_epi64((const __m128i *)b);
+ const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t *)m);
+ const __m128i v_a1_w = _mm_loadl_epi64((const __m128i *)(a + a_stride));
+ const __m128i v_b1_w = _mm_loadl_epi64((const __m128i *)(b + b_stride));
+ const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t *)(m + m_stride));
+
+ // Interleave 2 rows into a single register
+ const __m128i v_a_w = _mm_unpacklo_epi64(v_a0_w, v_a1_w);
+ const __m128i v_b_w = _mm_unpacklo_epi64(v_b0_w, v_b1_w);
+ const __m128i v_m_b = _mm_unpacklo_epi32(v_m0_b, v_m1_b);
+
+ // Unpack m to 16 bits - still containing max 8 bits
+ const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+
+ // Difference: [-4095, 4095]
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+
+ // Error - [-4095, 4095] * [0, 64] => fits in 19 bits (incl. sign bit)
+ const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
+
+ // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit)
+ const __m128i v_absd_w = _mm_abs_epi16(v_d_w);
+ const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero);
+ const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero);
+ const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d);
+ const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero);
+ const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero);
+ const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d);
+ // Square and sum the errors -> 36bits * 4 = 38bits
+ __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d;
+ v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d);
+ v_elo1_d = _mm_srli_si128(v_elo_d, 4);
+ v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d);
+ v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q);
+ v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d);
+ v_ehi3_d = _mm_srli_si128(v_ehi_d, 4);
+ v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d);
+ v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q);
+ v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
+
+ // Accumulate
+ v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
+ v_sse_q = _mm_add_epi64(v_sse_q, v_se_q);
+
+ // Move on to next row
+ a += a_stride * 2;
+ b += b_stride * 2;
+ m += m_stride * 2;
+ }
+
+ // Horizontal sum
+ *sum = hsum_epi32_si32(v_sum_d);
+ *sse = hsum_epi64_si64(v_sse_q);
+
+ // Round
+ *sum = (*sum >= 0) ? *sum : -*sum;
+ *sum = ROUND_POWER_OF_TWO(*sum, 6);
+ *sse = ROUND_POWER_OF_TWO(*sse, 12);
+}
+
+static INLINE unsigned int highbd_masked_variancewxh_ssse3(
+ const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) {
+ uint64_t sse64;
+ int64_t sum64;
+
+ if (w == 4)
+ highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride,
+ h, &sum64, &sse64);
+ else
+ highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h,
+ &sum64, &sse64);
+
+ // Store the SSE
+ *sse = (uint32_t)sse64;
+ // Compute and return variance
+ return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
+
+static INLINE unsigned int highbd_10_masked_variancewxh_ssse3(
+ const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) {
+ uint64_t sse64;
+ int64_t sum64;
+
+ if (w == 4)
+ highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride,
+ h, &sum64, &sse64);
+ else
+ highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h,
+ &sum64, &sse64);
+
+ // Normalise
+ sum64 = ROUND_POWER_OF_TWO(sum64, 2);
+ sse64 = ROUND_POWER_OF_TWO(sse64, 4);
+
+ // Store the SSE
+ *sse = (uint32_t)sse64;
+ // Compute and return variance
+ return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
+
+static INLINE unsigned int highbd_12_masked_variancewxh_ssse3(
+ const uint16_t *a, int a_stride, const uint16_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int w, int h, unsigned int *sse) {
+ uint64_t sse64;
+ int64_t sum64;
+
+ if (w == 4)
+ highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride, m, m_stride,
+ h, &sum64, &sse64);
+ else
+ highbd_masked_variance64_ssse3(a, a_stride, b, b_stride, m, m_stride, w, h,
+ &sum64, &sse64);
+
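+ // Normalise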
+ sum64 = ROUND_POWER_OF_TWO(sum64, 4);
+ sse64 = ROUND_POWER_OF_TWO(sse64, 8);
+
+ // Store the SSE
+ *sse = (uint32_t)sse64;
+ // Compute and return variance
+ return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
+
+#define HIGHBD_MASKED_VARWXH(W, H) \
+ unsigned int aom_highbd_masked_variance##W##x##H##_ssse3( \
+ const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8); \
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8); \
+ return highbd_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \
+ m_stride, W, H, sse); \
+ } \
+ \
+ unsigned int aom_highbd_10_masked_variance##W##x##H##_ssse3( \
+ const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8); \
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8); \
+ return highbd_10_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \
+ m_stride, W, H, sse); \
+ } \
+ \
+ unsigned int aom_highbd_12_masked_variance##W##x##H##_ssse3( \
+ const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, \
+ const uint8_t *m, int m_stride, unsigned int *sse) { \
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8); \
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8); \
+ return highbd_12_masked_variancewxh_ssse3(a, a_stride, b, b_stride, m, \
+ m_stride, W, H, sse); \
+ }
+
+HIGHBD_MASKED_VARWXH(4, 4)
+HIGHBD_MASKED_VARWXH(4, 8)
+HIGHBD_MASKED_VARWXH(8, 4)
+HIGHBD_MASKED_VARWXH(8, 8)
+HIGHBD_MASKED_VARWXH(8, 16)
+HIGHBD_MASKED_VARWXH(16, 8)
+HIGHBD_MASKED_VARWXH(16, 16)
+HIGHBD_MASKED_VARWXH(16, 32)
+HIGHBD_MASKED_VARWXH(32, 16)
+HIGHBD_MASKED_VARWXH(32, 32)
+HIGHBD_MASKED_VARWXH(32, 64)
+HIGHBD_MASKED_VARWXH(64, 32)
+HIGHBD_MASKED_VARWXH(64, 64)
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASKED_VARWXH(64, 128)
+HIGHBD_MASKED_VARWXH(128, 64)
+HIGHBD_MASKED_VARWXH(128, 128)
+#endif // CONFIG_EXT_PARTITION
+
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+// Sub pixel versions
+//////////////////////////////////////////////////////////////////////////////
+
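+// The sub-pixel routines interpolate the source with a 2-tap filter in x
+// and/or y (apply_filter for the general bilinear case, apply_filter_avg for
+// a plain average, selected via filter_fn_t), then accumulate the
+// mask-weighted sum and SSE with sum_and_sse().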
+typedef __m128i (*filter_fn_t)(__m128i v_a_b, __m128i v_b_b,
+ __m128i v_filter_b);
+
+static INLINE __m128i apply_filter_avg(const __m128i v_a_b, const __m128i v_b_b,
+ const __m128i v_filter_b) {
+ (void)v_filter_b;
+ return _mm_avg_epu8(v_a_b, v_b_b);
+}
+
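+// 2-tap bilinear filter: each output pixel is
+// (a * f0 + b * f1 + rounding) >> FILTER_BITS, packed back to 8 bits.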
+static INLINE __m128i apply_filter(const __m128i v_a_b, const __m128i v_b_b,
+ const __m128i v_filter_b) {
+ const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1));
+ __m128i v_input_lo_b = _mm_unpacklo_epi8(v_a_b, v_b_b);
+ __m128i v_input_hi_b = _mm_unpackhi_epi8(v_a_b, v_b_b);
+ __m128i v_temp0_w = _mm_maddubs_epi16(v_input_lo_b, v_filter_b);
+ __m128i v_temp1_w = _mm_maddubs_epi16(v_input_hi_b, v_filter_b);
+ __m128i v_res_lo_w =
+ _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS);
+ __m128i v_res_hi_w =
+ _mm_srai_epi16(_mm_add_epi16(v_temp1_w, v_rounding_w), FILTER_BITS);
+ return _mm_packus_epi16(v_res_lo_w, v_res_hi_w);
+}
+
+// Apply the filter to the contents of the lower half of a and b
+static INLINE void apply_filter_lo(const __m128i v_a_lo_b,
+ const __m128i v_b_lo_b,
+ const __m128i v_filter_b, __m128i *v_res_w) {
+ const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1));
+ __m128i v_input_b = _mm_unpacklo_epi8(v_a_lo_b, v_b_lo_b);
+ __m128i v_temp0_w = _mm_maddubs_epi16(v_input_b, v_filter_b);
+ *v_res_w =
+ _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w), FILTER_BITS);
+}
+
+static void sum_and_sse(const __m128i v_a_b, const __m128i v_b_b,
+ const __m128i v_m_b, __m128i *v_sum_d,
+ __m128i *v_sse_q) {
+ const __m128i v_zero = _mm_setzero_si128();
+ // Unpack to 16 bits - still containing max 8 bits
+ const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero);
+ const __m128i v_b0_w = _mm_unpacklo_epi8(v_b_b, v_zero);
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+ const __m128i v_a1_w = _mm_unpackhi_epi8(v_a_b, v_zero);
+ const __m128i v_b1_w = _mm_unpackhi_epi8(v_b_b, v_zero);
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m_b, v_zero);
+
+ // Difference: [-255, 255]
+ const __m128i v_d0_w = _mm_sub_epi16(v_a0_w, v_b0_w);
+ const __m128i v_d1_w = _mm_sub_epi16(v_a1_w, v_b1_w);
+
+ // Error - [-255, 255] * [0, 64] = [0xc040, 0x3fc0] => fits in 15 bits
+ const __m128i v_e0_w = _mm_mullo_epi16(v_d0_w, v_m0_w);
+ const __m128i v_e0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
+ const __m128i v_e1_w = _mm_mullo_epi16(v_d1_w, v_m1_w);
+ const __m128i v_e1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
+
+ // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
+ const __m128i v_se0_d = _mm_madd_epi16(v_e0_w, v_e0_w);
+ const __m128i v_se1_d = _mm_madd_epi16(v_e1_w, v_e1_w);
+
+ // Sum of v_se{0,1}_d - 31 bits + 31 bits = 32 bits
+ const __m128i v_se_d = _mm_add_epi32(v_se0_d, v_se1_d);
+
+ // Unpack Squared error to 64 bits
+ const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
+ const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
+
+ // Accumulate
+ *v_sum_d = _mm_add_epi32(*v_sum_d, v_e0_d);
+ *v_sum_d = _mm_add_epi32(*v_sum_d, v_e1_d);
+ *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_lo_q);
+ *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_hi_q);
+}
+
+// Functions for width (W) >= 16
+unsigned int aom_masked_subpel_varWxH_xzero(const uint8_t *src, int src_stride,
+ int yoffset, const uint8_t *dst,
+ int dst_stride, const uint8_t *msk,
+ int msk_stride, unsigned int *sse,
+ int w, int h,
+ filter_fn_t filter_fn) {
+ int i, j;
+ __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ const __m128i v_filter_b = _mm_set1_epi16(
+ (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ for (j = 0; j < w; j += 16) {
+ // Load the first row ready
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src + j));
+ // Process 2 rows at a time
+ for (i = 0; i < h; i += 2) {
+ // Load the next row and apply the filter
+ v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride));
+ v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b);
+ // Load the dst and msk for the variance calculation
+ v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j));
+ v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j));
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+
+ // Load the next row and apply the filter
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2));
+ v_res_b = filter_fn(v_src1_b, v_src0_b, v_filter_b);
+ // Load the dst and msk for the variance calculation
+ v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride));
+ v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j + msk_stride));
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next block of rows
+ src += src_stride * 2;
+ dst += dst_stride * 2;
+ msk += msk_stride * 2;
+ }
+ // Reset to the top of the block
+ src -= src_stride * h;
+ dst -= dst_stride * h;
+ msk -= msk_stride * h;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
+}
+unsigned int aom_masked_subpel_varWxH_yzero(const uint8_t *src, int src_stride,
+ int xoffset, const uint8_t *dst,
+ int dst_stride, const uint8_t *msk,
+ int msk_stride, unsigned int *sse,
+ int w, int h,
+ filter_fn_t filter_fn) {
+ int i, j;
+ __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ const __m128i v_filter_b = _mm_set1_epi16(
+ (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j += 16) {
+ // Load this row and the same row shifted right by one pixel & apply the filter to them
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src + j));
+ v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1));
+ v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b);
+
+ // Load the dst and msk for the variance calculation
+ v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j));
+ v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j));
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ msk += msk_stride;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
+}
+unsigned int aom_masked_subpel_varWxH_xnonzero_ynonzero(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+ unsigned int *sse, int w, int h, filter_fn_t xfilter_fn,
+ filter_fn_t yfilter_fn) {
+ int i, j;
+ __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b;
+ __m128i v_filtered0_b, v_filtered1_b, v_res_b, v_dst_b, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ const __m128i v_filterx_b = _mm_set1_epi16(
+ (bilinear_filters_2t[xoffset][1] << 8) + bilinear_filters_2t[xoffset][0]);
+ const __m128i v_filtery_b = _mm_set1_epi16(
+ (bilinear_filters_2t[yoffset][1] << 8) + bilinear_filters_2t[yoffset][0]);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ for (j = 0; j < w; j += 16) {
+ // Load the first row ready
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src + j));
+ v_src1_b = _mm_loadu_si128((const __m128i *)(src + j + 1));
+ v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b);
+ // Process 2 rows at a time
+ for (i = 0; i < h; i += 2) {
+ // Load the next row & apply the filter
+ v_src2_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j));
+ v_src3_b = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1));
+ v_filtered1_b = xfilter_fn(v_src2_b, v_src3_b, v_filterx_b);
+ // Load the dst and msk for the variance calculation
+ v_dst_b = _mm_loadu_si128((const __m128i *)(dst + j));
+ v_msk_b = _mm_loadu_si128((const __m128i *)(msk + j));
+ // Complete the calculation for this row and add it to the running total
+ v_res_b = yfilter_fn(v_filtered0_b, v_filtered1_b, v_filtery_b);
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+
+ // Load the next row & apply the filter
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j));
+ v_src1_b =
+ _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1));
+ v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b);
+ // Load the dst and msk for the variance calculation
+ v_dst_b = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j));
+ v_msk_b = _mm_loadu_si128((const __m128i *)(msk + msk_stride + j));
+ // Complete the calculation for this row and add it to the running total
+ v_res_b = yfilter_fn(v_filtered1_b, v_filtered0_b, v_filtery_b);
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next block of rows
+ src += src_stride * 2;
+ dst += dst_stride * 2;
+ msk += msk_stride * 2;
+ }
+ // Reset to the top of the block
+ src -= src_stride * h;
+ dst -= dst_stride * h;
+ msk -= msk_stride * h;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
+}
+
+// Note the order in which the rows are loaded: xmm[127:96] = row 1,
+// xmm[95:64] = row 2, xmm[63:32] = row 3, xmm[31:0] = row 4
+unsigned int aom_masked_subpel_var4xH_xzero(const uint8_t *src, int src_stride,
+ int yoffset, const uint8_t *dst,
+ int dst_stride, const uint8_t *msk,
+ int msk_stride, unsigned int *sse,
+ int h) {
+ int i;
+ __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered1_w, v_filtered2_w;
+ __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b;
+ __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) +
+ bilinear_filters_2t[yoffset][0]);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ // Load the first row of src data ready
+ v_src0_b = _mm_loadl_epi64((const __m128i *)src);
+ for (i = 0; i < h; i += 4) {
+ // Load the rest of the source data for these rows
+ v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
+ v_src1_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b);
+ v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
+ v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
+ v_src3_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b);
+ v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
+ // Load the dst data
+ v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0));
+ v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1));
+ v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b);
+ v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2));
+ v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3));
+ v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b);
+ v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b);
+ // Load the mask data
+ v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0));
+ v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1));
+ v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b);
+ v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2));
+ v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3));
+ v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
+ v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
+ // Apply the y filter
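+ // The rows are packed high-to-low (see the note above); build a second
+ // vector in which each lane holds the row below, by shifting the packed rows
+ // up one lane and inserting the next source row, then filter vertically.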
+ if (yoffset == HALF_PIXEL_OFFSET) {
+ v_src1_b = _mm_unpacklo_epi64(v_src3_b, v_src1_b);
+ v_src2_b =
+ _mm_or_si128(_mm_slli_si128(v_src1_b, 4),
+ _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0)));
+ v_res_b = _mm_avg_epu8(v_src1_b, v_src2_b);
+ } else {
+ v_src2_b =
+ _mm_or_si128(_mm_slli_si128(v_src1_b, 4),
+ _mm_and_si128(v_src2_b, _mm_setr_epi32(-1, 0, 0, 0)));
+ apply_filter_lo(v_src1_b, v_src2_b, v_filter_b, &v_filtered1_w);
+ v_src2_b =
+ _mm_or_si128(_mm_slli_si128(v_src3_b, 4),
+ _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0)));
+ apply_filter_lo(v_src3_b, v_src2_b, v_filter_b, &v_filtered2_w);
+ v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered1_w);
+ }
+ // Compute the sum and SSE
+ sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ msk += msk_stride * 4;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+// Note the order in which the rows are loaded:
+// xmm[127:64] = row 1, xmm[63:0] = row 2
+unsigned int aom_masked_subpel_var8xH_xzero(const uint8_t *src, int src_stride,
+ int yoffset, const uint8_t *dst,
+ int dst_stride, const uint8_t *msk,
+ int msk_stride, unsigned int *sse,
+ int h) {
+ int i;
+ __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_res_b;
+ __m128i v_dst_b = _mm_setzero_si128();
+ __m128i v_msk_b = _mm_setzero_si128();
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) +
+ bilinear_filters_2t[yoffset][0]);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ // Load the first row of src data ready
+ v_src0_b = _mm_loadl_epi64((const __m128i *)src);
+ for (i = 0; i < h; i += 2) {
+ if (yoffset == HALF_PIXEL_OFFSET) {
+ // Load the rest of the source data for these rows
+ v_src1_b = _mm_or_si128(
+ _mm_slli_si128(v_src0_b, 8),
+ _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)));
+ v_src0_b = _mm_or_si128(
+ _mm_slli_si128(v_src1_b, 8),
+ _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)));
+ // Apply the y filter
+ v_res_b = _mm_avg_epu8(v_src1_b, v_src0_b);
+ } else {
+ // Load the data and apply the y filter
+ v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
+ apply_filter_lo(v_src0_b, v_src1_b, v_filter_b, &v_filtered0_w);
+ v_src0_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
+ apply_filter_lo(v_src1_b, v_src0_b, v_filter_b, &v_filtered1_w);
+ v_res_b = _mm_packus_epi16(v_filtered1_w, v_filtered0_w);
+ }
+ // Load the dst data
+ v_dst_b = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)),
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)));
+ // Load the mask data
+ v_msk_b = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)),
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)));
+ // Compute the sum and SSE
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ src += src_stride * 2;
+ dst += dst_stride * 2;
+ msk += msk_stride * 2;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
+}
+
+// Note the order in which the rows are loaded: xmm[127:96] = row 1,
+// xmm[95:64] = row 2, xmm[63:32] = row 3, xmm[31:0] = row 4
+unsigned int aom_masked_subpel_var4xH_yzero(const uint8_t *src, int src_stride,
+ int xoffset, const uint8_t *dst,
+ int dst_stride, const uint8_t *msk,
+ int msk_stride, unsigned int *sse,
+ int h) {
+ int i;
+ __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w;
+ __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b;
+ __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b;
+ __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) +
+ bilinear_filters_2t[xoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ for (i = 0; i < h; i += 4) {
+ // Load the src data
+ v_src0_b = _mm_loadl_epi64((const __m128i *)src);
+ v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+ v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
+ v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b);
+ v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+ v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
+ v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b);
+ v_src2_shift_b = _mm_srli_si128(v_src2_b, 1);
+ v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
+ v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b);
+ v_src3_shift_b = _mm_srli_si128(v_src3_b, 1);
+ v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b);
+ // Load the dst data
+ v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0));
+ v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1));
+ v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b);
+ v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2));
+ v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3));
+ v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b);
+ v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b);
+ // Load the mask data
+ v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0));
+ v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1));
+ v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b);
+ v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2));
+ v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3));
+ v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
+ v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b);
+ v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b);
+ v_res_b = _mm_avg_epu8(v_src0_b, v_src0_shift_b);
+ } else {
+ apply_filter_lo(v_src0_b, v_src0_shift_b, v_filter_b, &v_filtered0_w);
+ apply_filter_lo(v_src2_b, v_src2_shift_b, v_filter_b, &v_filtered2_w);
+ v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered0_w);
+ }
+ // Compute the sum and SSE
+ sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ msk += msk_stride * 4;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+unsigned int aom_masked_subpel_var8xH_yzero(const uint8_t *src, int src_stride,
+ int xoffset, const uint8_t *dst,
+ int dst_stride, const uint8_t *msk,
+ int msk_stride, unsigned int *sse,
+ int h) {
+ int i;
+ __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w;
+ __m128i v_src0_shift_b, v_src1_shift_b, v_res_b, v_dst_b, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filter_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) +
+ bilinear_filters_2t[xoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ for (i = 0; i < h; i += 2) {
+ // Load the src data
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src));
+ v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+ v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride));
+ v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
+ v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
+ v_res_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
+ } else {
+ apply_filter_lo(v_src0_b, v_src0_shift_b, v_filter_b, &v_filtered0_w);
+ apply_filter_lo(v_src1_b, v_src1_shift_b, v_filter_b, &v_filtered1_w);
+ v_res_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
+ }
+ // Load the dst data
+ v_dst_b = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
+ // Load the mask data
+ v_msk_b = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
+ // Compute the sum and SSE
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ src += src_stride * 2;
+ dst += dst_stride * 2;
+ msk += msk_stride * 2;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
+}
+
+// Note the order in which the rows are loaded: xmm[127:96] = row 1,
+// xmm[95:64] = row 2, xmm[63:32] = row 3, xmm[31:0] = row 4
+unsigned int aom_masked_subpel_var4xH_xnonzero_ynonzero(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+ unsigned int *sse, int h) {
+ int i;
+ __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w;
+ __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b;
+ __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b, v_temp_b;
+ __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_extra_row_b, v_res_b;
+ __m128i v_xres_b[2];
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) +
+ bilinear_filters_2t[xoffset][0]);
+ __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) +
+ bilinear_filters_2t[yoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
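+ // First pass: apply the x filter to every row of the block (h is 4 or 8
+ // here), storing the packed results in v_xres_b, then filter one extra row
+ // below the block for use by the y filter.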
+ for (i = 0; i < h; i += 4) {
+ // Load the src data
+ v_src0_b = _mm_loadl_epi64((const __m128i *)src);
+ v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+ v_src1_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
+ v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b);
+ v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+ v_src2_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
+ v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b);
+ v_src2_shift_b = _mm_srli_si128(v_src2_b, 1);
+ v_src3_b = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
+ v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b);
+ v_src3_shift_b = _mm_srli_si128(v_src3_b, 1);
+ v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b);
+ v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b);
+ v_xres_b[i == 0 ? 0 : 1] = _mm_avg_epu8(v_src0_b, v_src0_shift_b);
+ } else {
+ apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
+ apply_filter_lo(v_src2_b, v_src2_shift_b, v_filterx_b, &v_filtered2_w);
+ v_xres_b[i == 0 ? 0 : 1] = _mm_packus_epi16(v_filtered2_w, v_filtered0_w);
+ }
+ // Move onto the next set of rows
+ src += src_stride * 4;
+ }
+ // Load one more row to be used in the y filter
+ v_src0_b = _mm_loadl_epi64((const __m128i *)src);
+ v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_extra_row_b = _mm_and_si128(_mm_avg_epu8(v_src0_b, v_src0_shift_b),
+ _mm_setr_epi32(-1, 0, 0, 0));
+ } else {
+ apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
+ v_extra_row_b =
+ _mm_and_si128(_mm_packus_epi16(v_filtered0_w, _mm_setzero_si128()),
+ _mm_setr_epi32(-1, 0, 0, 0));
+ }
+
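+ // Second pass: pair each x-filtered row with the row below it, apply the
+ // y filter, and accumulate the masked sum and SSE four rows at a time.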
+ for (i = 0; i < h; i += 4) {
+ if (h == 8 && i == 0) {
+ v_temp_b = _mm_or_si128(_mm_slli_si128(v_xres_b[0], 4),
+ _mm_srli_si128(v_xres_b[1], 12));
+ } else {
+ v_temp_b = _mm_or_si128(_mm_slli_si128(v_xres_b[i == 0 ? 0 : 1], 4),
+ v_extra_row_b);
+ }
+ // Apply the y filter
+ if (yoffset == HALF_PIXEL_OFFSET) {
+ v_res_b = _mm_avg_epu8(v_xres_b[i == 0 ? 0 : 1], v_temp_b);
+ } else {
+ v_res_b = apply_filter(v_xres_b[i == 0 ? 0 : 1], v_temp_b, v_filtery_b);
+ }
+
+ // Load the dst data
+ v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 0));
+ v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 1));
+ v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b);
+ v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 2));
+ v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t *)(dst + dst_stride * 3));
+ v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b);
+ v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b);
+ // Load the mask data
+ v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 0));
+ v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 1));
+ v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b);
+ v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 2));
+ v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t *)(msk + msk_stride * 3));
+ v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
+ v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
+ // Compute the sum and SSE
+ sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ dst += dst_stride * 4;
+ msk += msk_stride * 4;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+unsigned int aom_masked_subpel_var8xH_xnonzero_ynonzero(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+ unsigned int *sse, int h) {
+ int i;
+ __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_dst_b, v_msk_b;
+ __m128i v_src0_shift_b, v_src1_shift_b;
+ __m128i v_xres0_b, v_xres1_b, v_res_b, v_temp_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filterx_b = _mm_set1_epi16((bilinear_filters_2t[xoffset][1] << 8) +
+ bilinear_filters_2t[xoffset][0]);
+ __m128i v_filtery_b = _mm_set1_epi16((bilinear_filters_2t[yoffset][1] << 8) +
+ bilinear_filters_2t[yoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ // Load the first block of src data
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src));
+ v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+ v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride));
+ v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
+ v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
+ v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
+ } else {
+ apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
+ apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w);
+ v_xres0_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
+ }
+ for (i = 0; i < h; i += 4) {
+ // Load the next block of src data
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 2));
+ v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+ v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 3));
+ v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
+ v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
+ v_xres1_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
+ } else {
+ apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
+ apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w);
+ v_xres1_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
+ }
+ // Apply the y filter to the previous block
+ v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres0_b, 8),
+ _mm_slli_si128(v_xres1_b, 8));
+ if (yoffset == HALF_PIXEL_OFFSET) {
+ v_res_b = _mm_avg_epu8(v_xres0_b, v_temp_b);
+ } else {
+ v_res_b = apply_filter(v_xres0_b, v_temp_b, v_filtery_b);
+ }
+ // Load the dst data
+ v_dst_b = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
+ // Load the mask data
+ v_msk_b = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
+ // Compute the sum and SSE
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+
+ // Load the next block of src data
+ v_src0_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 4));
+ v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+ v_src1_b = _mm_loadu_si128((const __m128i *)(src + src_stride * 5));
+ v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
+ v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
+ v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
+ } else {
+ apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
+ apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w);
+ v_xres0_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
+ }
+ // Apply the y filter to the previous block
+ v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres1_b, 8),
+ _mm_slli_si128(v_xres0_b, 8));
+ if (yoffset == HALF_PIXEL_OFFSET) {
+ v_res_b = _mm_avg_epu8(v_xres1_b, v_temp_b);
+ } else {
+ v_res_b = apply_filter(v_xres1_b, v_temp_b, v_filtery_b);
+ }
+ // Load the dst data
+ v_dst_b = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)),
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3)));
+ // Load the mask data
+ v_msk_b = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)),
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3)));
+ // Compute the sum and SSE
+ sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ msk += msk_stride * 4;
+ }
+ return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
+}
+
+// For W >= 16
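+// Dispatch on the subpel offsets: if both are zero, the full-pel masked
+// variance is used; a zero offset skips filtering in that direction; half-pel
+// offsets use the averaging filter, other offsets the general bilinear filter.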
+#define MASK_SUBPIX_VAR_LARGE(W, H) \
+ unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ assert(W % 16 == 0); \
+ if (xoffset == 0) { \
+ if (yoffset == 0) \
+ return aom_masked_variance##W##x##H##_ssse3( \
+ src, src_stride, dst, dst_stride, msk, msk_stride, sse); \
+ else if (yoffset == HALF_PIXEL_OFFSET) \
+ return aom_masked_subpel_varWxH_xzero( \
+ src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, apply_filter_avg); \
+ else \
+ return aom_masked_subpel_varWxH_xzero(src, src_stride, yoffset, dst, \
+ dst_stride, msk, msk_stride, \
+ sse, W, H, apply_filter); \
+ } else if (yoffset == 0) { \
+ if (xoffset == HALF_PIXEL_OFFSET) \
+ return aom_masked_subpel_varWxH_yzero( \
+ src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, apply_filter_avg); \
+ else \
+ return aom_masked_subpel_varWxH_yzero(src, src_stride, xoffset, dst, \
+ dst_stride, msk, msk_stride, \
+ sse, W, H, apply_filter); \
+ } else if (xoffset == HALF_PIXEL_OFFSET) { \
+ if (yoffset == HALF_PIXEL_OFFSET) \
+ return aom_masked_subpel_varWxH_xnonzero_ynonzero( \
+ src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, \
+ dst_stride, msk, msk_stride, sse, W, H, apply_filter_avg, \
+ apply_filter_avg); \
+ else \
+ return aom_masked_subpel_varWxH_xnonzero_ynonzero( \
+ src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, apply_filter_avg, apply_filter); \
+ } else { \
+ if (yoffset == HALF_PIXEL_OFFSET) \
+ return aom_masked_subpel_varWxH_xnonzero_ynonzero( \
+ src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, apply_filter, apply_filter_avg); \
+ else \
+ return aom_masked_subpel_varWxH_xnonzero_ynonzero( \
+ src, src_stride, xoffset, yoffset, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, apply_filter, apply_filter); \
+ } \
+ }
+
+// For W < 16
+#define MASK_SUBPIX_VAR_SMALL(W, H) \
+ unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ assert(W == 4 || W == 8); \
+ if (xoffset == 0 && yoffset == 0) \
+ return aom_masked_variance##W##x##H##_ssse3( \
+ src, src_stride, dst, dst_stride, msk, msk_stride, sse); \
+ else if (xoffset == 0) \
+ return aom_masked_subpel_var##W##xH_xzero( \
+ src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H); \
+ else if (yoffset == 0) \
+ return aom_masked_subpel_var##W##xH_yzero( \
+ src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H); \
+ else \
+ return aom_masked_subpel_var##W##xH_xnonzero_ynonzero( \
+ src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \
+ sse, H); \
+ }
+
+MASK_SUBPIX_VAR_SMALL(4, 4)
+MASK_SUBPIX_VAR_SMALL(4, 8)
+MASK_SUBPIX_VAR_SMALL(8, 4)
+MASK_SUBPIX_VAR_SMALL(8, 8)
+MASK_SUBPIX_VAR_SMALL(8, 16)
+MASK_SUBPIX_VAR_LARGE(16, 8)
+MASK_SUBPIX_VAR_LARGE(16, 16)
+MASK_SUBPIX_VAR_LARGE(16, 32)
+MASK_SUBPIX_VAR_LARGE(32, 16)
+MASK_SUBPIX_VAR_LARGE(32, 32)
+MASK_SUBPIX_VAR_LARGE(32, 64)
+MASK_SUBPIX_VAR_LARGE(64, 32)
+MASK_SUBPIX_VAR_LARGE(64, 64)
+#if CONFIG_EXT_PARTITION
+MASK_SUBPIX_VAR_LARGE(64, 128)
+MASK_SUBPIX_VAR_LARGE(128, 64)
+MASK_SUBPIX_VAR_LARGE(128, 128)
+#endif // CONFIG_EXT_PARTITION
+
+#if CONFIG_AOM_HIGHBITDEPTH
+typedef uint32_t (*highbd_calc_masked_var_t)(__m128i v_sum_d, __m128i v_sse_q,
+ uint32_t *sse, const int w,
+ const int h);
+typedef unsigned int (*highbd_variance_fn_t)(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ const uint8_t *m, int m_stride,
+ unsigned int *sse);
+typedef __m128i (*highbd_filter_fn_t)(__m128i v_a_w, __m128i v_b_w,
+ __m128i v_filter_w);
+
+static INLINE __m128i highbd_apply_filter_avg(const __m128i v_a_w,
+ const __m128i v_b_w,
+ const __m128i v_filter_w) {
+ (void)v_filter_w;
+ return _mm_avg_epu16(v_a_w, v_b_w);
+}
+
+static INLINE __m128i highbd_apply_filter(const __m128i v_a_w,
+ const __m128i v_b_w,
+ const __m128i v_filter_w) {
+ const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+ __m128i v_input_lo_w = _mm_unpacklo_epi16(v_a_w, v_b_w);
+ __m128i v_input_hi_w = _mm_unpackhi_epi16(v_a_w, v_b_w);
+ __m128i v_temp0_d = _mm_madd_epi16(v_input_lo_w, v_filter_w);
+ __m128i v_temp1_d = _mm_madd_epi16(v_input_hi_w, v_filter_w);
+ __m128i v_res_lo_d =
+ _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS);
+ __m128i v_res_hi_d =
+ _mm_srai_epi32(_mm_add_epi32(v_temp1_d, v_rounding_d), FILTER_BITS);
+ return _mm_packs_epi32(v_res_lo_d, v_res_hi_d);
+}
+// Apply the filter to the contents of the lower half of a and b
+static INLINE void highbd_apply_filter_lo(const __m128i v_a_lo_w,
+ const __m128i v_b_lo_w,
+ const __m128i v_filter_w,
+ __m128i *v_res_d) {
+ const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+ __m128i v_input_w = _mm_unpacklo_epi16(v_a_lo_w, v_b_lo_w);
+ __m128i v_temp0_d = _mm_madd_epi16(v_input_w, v_filter_w);
+ *v_res_d =
+ _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d), FILTER_BITS);
+}
+
+static void highbd_sum_and_sse(const __m128i v_a_w, const __m128i v_b_w,
+ const __m128i v_m_b, __m128i *v_sum_d,
+ __m128i *v_sse_q) {
+ const __m128i v_zero = _mm_setzero_si128();
+ const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+
+ // Difference: [-2^12, 2^12] => 13 bits (incl. sign bit)
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+
+ // Error - [-4095, 4095] * [0, 64] & sum pairs => fits in 19 + 1 bits
+ const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
+
+ // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit)
+ const __m128i v_absd_w = _mm_abs_epi16(v_d_w);
+ const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero);
+ const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero);
+ const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d);
+ const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero);
+ const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero);
+ const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d);
+ // Square and sum the errors -> 36 bits * 4 = 38 bits
+ __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d;
+ v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d);
+ v_elo1_d = _mm_srli_si128(v_elo_d, 4);
+ v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d);
+ v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q);
+ v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d);
+ v_ehi3_d = _mm_srli_si128(v_ehi_d, 4);
+ v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d);
+ v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q);
+ v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
+
+ // Accumulate
+ *v_sum_d = _mm_add_epi32(*v_sum_d, v_e_d);
+ *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_q);
+}
+
+static INLINE uint32_t highbd_10_calc_masked_variance(
+ __m128i v_sum_d, __m128i v_sse_q, uint32_t *sse, const int w, const int h) {
+ int64_t sum64;
+ uint64_t sse64;
+
+ // Horizontal sum
+ sum64 = hsum_epi32_si32(v_sum_d);
+ sse64 = hsum_epi64_si64(v_sse_q);
+
+ sum64 = (sum64 >= 0) ? sum64 : -sum64;
+
+ // Round
+ sum64 = ROUND_POWER_OF_TWO(sum64, 6);
+ sse64 = ROUND_POWER_OF_TWO(sse64, 12);
+
+ // Normalise
+ sum64 = ROUND_POWER_OF_TWO(sum64, 2);
+ sse64 = ROUND_POWER_OF_TWO(sse64, 4);
+
+ // Store the SSE
+ *sse = (uint32_t)sse64;
+ // Compute the variance
+ return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
+static INLINE uint32_t highbd_12_calc_masked_variance(
+ __m128i v_sum_d, __m128i v_sse_q, uint32_t *sse, const int w, const int h) {
+ int64_t sum64;
+ uint64_t sse64;
+
+ // Horizontal sum
+ sum64 = hsum_epi32_si64(v_sum_d);
+ sse64 = hsum_epi64_si64(v_sse_q);
+
+ sum64 = (sum64 >= 0) ? sum64 : -sum64;
+
+ // Round
+ sum64 = ROUND_POWER_OF_TWO(sum64, 6);
+ sse64 = ROUND_POWER_OF_TWO(sse64, 12);
+
+ // Normalise
+ sum64 = ROUND_POWER_OF_TWO(sum64, 4);
+ sse64 = ROUND_POWER_OF_TWO(sse64, 8);
+
+ // Store the SSE
+ *sse = (uint32_t)sse64;
+ // Compute the variance
+ return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
+
+// High bit depth functions for width (W) >= 8
+unsigned int aom_highbd_masked_subpel_varWxH_xzero(
+ const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst,
+ int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse,
+ int w, int h, highbd_filter_fn_t filter_fn,
+ highbd_calc_masked_var_t calc_var) {
+ int i, j;
+ __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ const __m128i v_filter_w =
+ _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) +
+ bilinear_filters_2t[yoffset][0]);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ for (j = 0; j < w; j += 8) {
+ // Load the first row ready
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src + j));
+ // Process 2 rows at a time
+ for (i = 0; i < h; i += 2) {
+ // Load the next row & apply the filter
+ v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride));
+ v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w);
+ // Load the dst and msk for the variance calculation
+ v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j));
+ v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j));
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+
+ // Load the next row & apply the filter
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src + j + src_stride * 2));
+ v_res_w = filter_fn(v_src1_w, v_src0_w, v_filter_w);
+ // Load the dst and msk for the variance calculation
+ v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j + dst_stride));
+ v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j + msk_stride));
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next block of rows
+ src += src_stride * 2;
+ dst += dst_stride * 2;
+ msk += msk_stride * 2;
+ }
+ // Reset to the top of the block
+ src -= src_stride * h;
+ dst -= dst_stride * h;
+ msk -= msk_stride * h;
+ }
+ return calc_var(v_sum_d, v_sse_q, sse, w, h);
+}
+unsigned int aom_highbd_masked_subpel_varWxH_yzero(
+ const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst,
+ int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse,
+ int w, int h, highbd_filter_fn_t filter_fn,
+ highbd_calc_masked_var_t calc_var) {
+ int i, j;
+ __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ const __m128i v_filter_w =
+ _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) +
+ bilinear_filters_2t[xoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j += 8) {
+ // Load this row and the same row shifted right by one pixel & apply the x filter
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src + j));
+ v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1));
+ v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w);
+
+ // Load the dst and msk for the variance calculation
+ v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j));
+ v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j));
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ msk += msk_stride;
+ }
+ return calc_var(v_sum_d, v_sse_q, sse, w, h);
+}
+
+unsigned int aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero(
+ const uint16_t *src, int src_stride, int xoffset, int yoffset,
+ const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+ unsigned int *sse, int w, int h, highbd_filter_fn_t xfilter_fn,
+ highbd_filter_fn_t yfilter_fn, highbd_calc_masked_var_t calc_var) {
+ int i, j;
+ __m128i v_src0_w, v_src1_w, v_src2_w, v_src3_w;
+ __m128i v_filtered0_w, v_filtered1_w, v_res_w, v_dst_w, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ const __m128i v_filterx_w =
+ _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) +
+ bilinear_filters_2t[xoffset][0]);
+ const __m128i v_filtery_w =
+ _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) +
+ bilinear_filters_2t[yoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ for (j = 0; j < w; j += 8) {
+ // Load the first row ready
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src + j));
+ v_src1_w = _mm_loadu_si128((const __m128i *)(src + j + 1));
+ v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w);
+ // Process 2 rows at a time
+ for (i = 0; i < h; i += 2) {
+ // Load the next row & apply the filter
+ v_src2_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j));
+ v_src3_w = _mm_loadu_si128((const __m128i *)(src + src_stride + j + 1));
+ v_filtered1_w = xfilter_fn(v_src2_w, v_src3_w, v_filterx_w);
+ // Load the dst and msk for the variance calculation
+ v_dst_w = _mm_loadu_si128((const __m128i *)(dst + j));
+ v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + j));
+ // Complete the calculation for this row and add it to the running total
+ v_res_w = yfilter_fn(v_filtered0_w, v_filtered1_w, v_filtery_w);
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+
+ // Load the next row & apply the filter
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j));
+ v_src1_w =
+ _mm_loadu_si128((const __m128i *)(src + src_stride * 2 + j + 1));
+ v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w);
+ // Load the dst and msk for the variance calculation
+ v_dst_w = _mm_loadu_si128((const __m128i *)(dst + dst_stride + j));
+ v_msk_b = _mm_loadl_epi64((const __m128i *)(msk + msk_stride + j));
+ // Complete the calculation for this row and add it to the running total
+ v_res_w = yfilter_fn(v_filtered1_w, v_filtered0_w, v_filtery_w);
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next block of rows
+ src += src_stride * 2;
+ dst += dst_stride * 2;
+ msk += msk_stride * 2;
+ }
+ // Reset to the top of the block
+ src -= src_stride * h;
+ dst -= dst_stride * h;
+ msk -= msk_stride * h;
+ }
+ return calc_var(v_sum_d, v_sse_q, sse, w, h);
+}
+
+// Note the order in which the rows are loaded:
+// xmm[127:64] = row 1, xmm[63:0] = row 2
+unsigned int aom_highbd_masked_subpel_var4xH_xzero(
+ const uint16_t *src, int src_stride, int yoffset, const uint16_t *dst,
+ int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse,
+ int h, highbd_calc_masked_var_t calc_var) {
+ int i;
+ __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_res_w;
+ __m128i v_dst_w, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) +
+ bilinear_filters_2t[yoffset][0]);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ // Load the first row of src data ready
+ v_src0_w = _mm_loadl_epi64((const __m128i *)src);
+ for (i = 0; i < h; i += 2) {
+ if (yoffset == HALF_PIXEL_OFFSET) {
+ // Load the rest of the source data for these rows
+ v_src1_w = _mm_or_si128(
+ _mm_slli_si128(v_src0_w, 8),
+ _mm_loadl_epi64((const __m128i *)(src + src_stride * 1)));
+ v_src0_w = _mm_or_si128(
+ _mm_slli_si128(v_src1_w, 8),
+ _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)));
+ // Apply the y filter
+ v_res_w = _mm_avg_epu16(v_src1_w, v_src0_w);
+ } else {
+ // Load the data and apply the y filter
+ v_src1_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
+ highbd_apply_filter_lo(v_src0_w, v_src1_w, v_filter_w, &v_filtered0_d);
+ v_src0_w = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
+ highbd_apply_filter_lo(v_src1_w, v_src0_w, v_filter_w, &v_filtered1_d);
+ v_res_w = _mm_packs_epi32(v_filtered1_d, v_filtered0_d);
+ }
+ // Load the dst data
+ v_dst_w = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)),
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)));
+ // Load the mask data
+ v_msk_b = _mm_unpacklo_epi32(
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)),
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)));
+ // Compute the sum and SSE
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ src += src_stride * 2;
+ dst += dst_stride * 2;
+ msk += msk_stride * 2;
+ }
+ return calc_var(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+unsigned int aom_highbd_masked_subpel_var4xH_yzero(
+ const uint16_t *src, int src_stride, int xoffset, const uint16_t *dst,
+ int dst_stride, const uint8_t *msk, int msk_stride, unsigned int *sse,
+ int h, highbd_calc_masked_var_t calc_var) {
+ int i;
+ __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d;
+ __m128i v_src0_shift_w, v_src1_shift_w, v_res_w, v_dst_w, v_msk_b;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filter_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) +
+ bilinear_filters_2t[xoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ for (i = 0; i < h; i += 2) {
+ // Load the src data
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src));
+ v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
+ v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride));
+ v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
+ v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
+ v_res_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
+ } else {
+ highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filter_w,
+ &v_filtered0_d);
+ highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filter_w,
+ &v_filtered1_d);
+ v_res_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
+ }
+ // Load the dst data
+ v_dst_w = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
+ // Load the mask data
+ v_msk_b = _mm_unpacklo_epi32(
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
+ // Compute the sum and SSE
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ src += src_stride * 2;
+ dst += dst_stride * 2;
+ msk += msk_stride * 2;
+ }
+ return calc_var(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+unsigned int aom_highbd_masked_subpel_var4xH_xnonzero_ynonzero(
+ const uint16_t *src, int src_stride, int xoffset, int yoffset,
+ const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+ unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) {
+ int i;
+ __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_dst_w, v_msk_b;
+ __m128i v_src0_shift_w, v_src1_shift_w;
+ __m128i v_xres0_w, v_xres1_w, v_res_w, v_temp_w;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_q = _mm_setzero_si128();
+ __m128i v_filterx_w = _mm_set1_epi32((bilinear_filters_2t[xoffset][1] << 16) +
+ bilinear_filters_2t[xoffset][0]);
+ __m128i v_filtery_w = _mm_set1_epi32((bilinear_filters_2t[yoffset][1] << 16) +
+ bilinear_filters_2t[yoffset][0]);
+ assert(xoffset < BIL_SUBPEL_SHIFTS);
+ assert(yoffset < BIL_SUBPEL_SHIFTS);
+ // Load the first block of src data
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src));
+ v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
+ v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride));
+ v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
+ v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
+ v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
+ } else {
+ highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w,
+ &v_filtered0_d);
+ highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w,
+ &v_filtered1_d);
+ v_xres0_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
+ }
+ for (i = 0; i < h; i += 4) {
+ // Load the next block of src data
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 2));
+ v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
+ v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 3));
+ v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
+ v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
+ v_xres1_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
+ } else {
+ highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w,
+ &v_filtered0_d);
+ highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w,
+ &v_filtered1_d);
+ v_xres1_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
+ }
+ // Apply the y filter to the previous block
+ v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres0_w, 8),
+ _mm_slli_si128(v_xres1_w, 8));
+ if (yoffset == HALF_PIXEL_OFFSET) {
+ v_res_w = _mm_avg_epu16(v_xres0_w, v_temp_w);
+ } else {
+ v_res_w = highbd_apply_filter(v_xres0_w, v_temp_w, v_filtery_w);
+ }
+ // Load the dst data
+ v_dst_w = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
+ // Load the mask data
+ v_msk_b = _mm_unpacklo_epi32(
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
+ // Compute the sum and SSE
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+
+ // Load the next block of src data
+ v_src0_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 4));
+ v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
+ v_src1_w = _mm_loadu_si128((const __m128i *)(src + src_stride * 5));
+ v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
+ // Apply the x filter
+ if (xoffset == HALF_PIXEL_OFFSET) {
+ v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
+ v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
+ v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
+ } else {
+ highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w,
+ &v_filtered0_d);
+ highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w,
+ &v_filtered1_d);
+ v_xres0_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
+ }
+ // Apply the y filter to the previous block
+ v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres1_w, 8),
+ _mm_slli_si128(v_xres0_w, 8));
+ if (yoffset == HALF_PIXEL_OFFSET) {
+ v_res_w = _mm_avg_epu16(v_xres1_w, v_temp_w);
+ } else {
+ v_res_w = highbd_apply_filter(v_xres1_w, v_temp_w, v_filtery_w);
+ }
+ // Load the dst data
+ v_dst_w = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)),
+ _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3)));
+ // Load the mask data
+ v_msk_b = _mm_unpacklo_epi32(
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)),
+ _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3)));
+ // Compute the sum and SSE
+ highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+ // Move onto the next set of rows
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ msk += msk_stride * 4;
+ }
+ return calc_var(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+// For W >= 8
+#define HIGHBD_MASK_SUBPIX_VAR_LARGE(W, H) \
+ unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse, highbd_calc_masked_var_t calc_var, \
+ highbd_variance_fn_t full_variance_function) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ assert(W % 8 == 0); \
+ if (xoffset == 0) { \
+ if (yoffset == 0) \
+ return full_variance_function(src8, src_stride, dst8, dst_stride, msk, \
+ msk_stride, sse); \
+ else if (yoffset == HALF_PIXEL_OFFSET) \
+ return aom_highbd_masked_subpel_varWxH_xzero( \
+ src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var); \
+ else \
+ return aom_highbd_masked_subpel_varWxH_xzero( \
+ src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, \
+ W, H, highbd_apply_filter, calc_var); \
+ } else if (yoffset == 0) { \
+ if (xoffset == HALF_PIXEL_OFFSET) \
+ return aom_highbd_masked_subpel_varWxH_yzero( \
+ src, src_stride, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, highbd_apply_filter_avg, calc_var); \
+ else \
+ return aom_highbd_masked_subpel_varWxH_yzero( \
+ src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, \
+ W, H, highbd_apply_filter, calc_var); \
+ } else if (xoffset == HALF_PIXEL_OFFSET) { \
+ if (yoffset == HALF_PIXEL_OFFSET) \
+ return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \
+ src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, \
+ dst_stride, msk, msk_stride, sse, W, H, highbd_apply_filter_avg, \
+ highbd_apply_filter_avg, calc_var); \
+ else \
+ return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \
+ src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, highbd_apply_filter_avg, \
+ highbd_apply_filter, calc_var); \
+ } else { \
+ if (yoffset == HALF_PIXEL_OFFSET) \
+ return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \
+ src, src_stride, xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, highbd_apply_filter, \
+ highbd_apply_filter_avg, calc_var); \
+ else \
+ return aom_highbd_masked_subpel_varWxH_xnonzero_ynonzero( \
+ src, src_stride, xoffset, yoffset, dst, dst_stride, msk, \
+ msk_stride, sse, W, H, highbd_apply_filter, highbd_apply_filter, \
+ calc_var); \
+ } \
+ }
+
+// For W < 8
+#define HIGHBD_MASK_SUBPIX_VAR_SMALL(W, H) \
+ unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse, highbd_calc_masked_var_t calc_var, \
+ highbd_variance_fn_t full_variance_function) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ assert(W == 4); \
+ if (xoffset == 0 && yoffset == 0) \
+ return full_variance_function(src8, src_stride, dst8, dst_stride, msk, \
+ msk_stride, sse); \
+ else if (xoffset == 0) \
+ return aom_highbd_masked_subpel_var4xH_xzero( \
+ src, src_stride, yoffset, dst, dst_stride, msk, msk_stride, sse, H, \
+ calc_var); \
+ else if (yoffset == 0) \
+ return aom_highbd_masked_subpel_var4xH_yzero( \
+ src, src_stride, xoffset, dst, dst_stride, msk, msk_stride, sse, H, \
+ calc_var); \
+ else \
+ return aom_highbd_masked_subpel_var4xH_xnonzero_ynonzero( \
+ src, src_stride, xoffset, yoffset, dst, dst_stride, msk, msk_stride, \
+ sse, H, calc_var); \
+ }
+
+#define HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(W, H) \
+ unsigned int aom_highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \
+ sse, calc_masked_variance, \
+ aom_highbd_masked_variance##W##x##H##_ssse3); \
+ } \
+ unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \
+ sse, highbd_10_calc_masked_variance, \
+ aom_highbd_10_masked_variance##W##x##H##_ssse3); \
+ } \
+ unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst8, int dst_stride, const uint8_t *msk, int msk_stride, \
+ unsigned int *sse) { \
+ return highbd_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ src8, src_stride, xoffset, yoffset, dst8, dst_stride, msk, msk_stride, \
+ sse, highbd_12_calc_masked_variance, \
+ aom_highbd_12_masked_variance##W##x##H##_ssse3); \
+ }
+
+HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 4)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 4)
+HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 8)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 8)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 4)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 4)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 8)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 8)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 16)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 16)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 8)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 8)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 16)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 16)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 32)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 32)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 16)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 16)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 32)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 32)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 64)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 64)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 32)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 32)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 64)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 64)
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 128)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 128)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 64)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 64)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 128)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 128)
+#endif // CONFIG_EXT_PARTITION
+#endif  // CONFIG_AOM_HIGHBITDEPTH
diff --git a/aom_dsp/x86/obmc_sad_sse4.c b/aom_dsp/x86/obmc_sad_sse4.c
index 3e12749..cf54ca8 100644
--- a/aom_dsp/x86/obmc_sad_sse4.c
+++ b/aom_dsp/x86/obmc_sad_sse4.c
@@ -118,6 +118,11 @@
} \
}
+#if CONFIG_EXT_PARTITION
+OBMCSADWXH(128, 128)
+OBMCSADWXH(128, 64)
+OBMCSADWXH(64, 128)
+#endif // CONFIG_EXT_PARTITION
OBMCSADWXH(64, 64)
OBMCSADWXH(64, 32)
OBMCSADWXH(32, 64)
@@ -236,6 +241,11 @@
} \
}
+#if CONFIG_EXT_PARTITION
+HBD_OBMCSADWXH(128, 128)
+HBD_OBMCSADWXH(128, 64)
+HBD_OBMCSADWXH(64, 128)
+#endif // CONFIG_EXT_PARTITION
HBD_OBMCSADWXH(64, 64)
HBD_OBMCSADWXH(64, 32)
HBD_OBMCSADWXH(32, 64)
diff --git a/aom_dsp/x86/obmc_variance_sse4.c b/aom_dsp/x86/obmc_variance_sse4.c
index 0612c9a..6e9117d 100644
--- a/aom_dsp/x86/obmc_variance_sse4.c
+++ b/aom_dsp/x86/obmc_variance_sse4.c
@@ -127,6 +127,11 @@
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
+#if CONFIG_EXT_PARTITION
+OBMCVARWXH(128, 128)
+OBMCVARWXH(128, 64)
+OBMCVARWXH(64, 128)
+#endif // CONFIG_EXT_PARTITION
OBMCVARWXH(64, 64)
OBMCVARWXH(64, 32)
OBMCVARWXH(32, 64)
@@ -325,6 +330,11 @@
return *sse - (((int64_t)sum * sum) / (W * H)); \
}
+#if CONFIG_EXT_PARTITION
+HBD_OBMCVARWXH(128, 128)
+HBD_OBMCVARWXH(128, 64)
+HBD_OBMCVARWXH(64, 128)
+#endif // CONFIG_EXT_PARTITION
HBD_OBMCVARWXH(64, 64)
HBD_OBMCVARWXH(64, 32)
HBD_OBMCVARWXH(32, 64)
diff --git a/aom_dsp/x86/sad4d_avx2.c b/aom_dsp/x86/sad4d_avx2.c
index 686ee24..0ce9bed 100644
--- a/aom_dsp/x86/sad4d_avx2.c
+++ b/aom_dsp/x86/sad4d_avx2.c
@@ -78,6 +78,7 @@
_mm_storeu_si128((__m128i *)(res), sum);
}
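+ // Zero the upper halves of the ymm registers to avoid AVX-SSE transition
+ // penalties in SSE code that runs afterwards.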
+ _mm256_zeroupper();
}
void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
@@ -162,4 +163,5 @@
_mm_storeu_si128((__m128i *)(res), sum);
}
+ _mm256_zeroupper();
}
diff --git a/aom_dsp/x86/sad4d_sse2.asm b/aom_dsp/x86/sad4d_sse2.asm
index f27015f..8f04ef2 100644
--- a/aom_dsp/x86/sad4d_sse2.asm
+++ b/aom_dsp/x86/sad4d_sse2.asm
@@ -233,6 +233,11 @@
%endmacro
INIT_XMM sse2
+%if CONFIG_EXT_PARTITION
+SADNXN4D 128, 128
+SADNXN4D 128, 64
+SADNXN4D 64, 128
+%endif
SADNXN4D 64, 64
SADNXN4D 64, 32
SADNXN4D 32, 64
diff --git a/aom_dsp/x86/sad_avx2.c b/aom_dsp/x86/sad_avx2.c
index 6ce61fd..efba612 100644
--- a/aom_dsp/x86/sad_avx2.c
+++ b/aom_dsp/x86/sad_avx2.c
@@ -37,6 +37,7 @@
sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
res = _mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
return res; \
}
@@ -69,6 +70,7 @@
sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
res = _mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
return res; \
}
@@ -122,6 +124,7 @@
sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
res = _mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
return res; \
}
@@ -160,6 +163,7 @@
sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
res = _mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
return res; \
}
diff --git a/aom_dsp/x86/sad_sse2.asm b/aom_dsp/x86/sad_sse2.asm
index 3b8ed26..e45457a 100644
--- a/aom_dsp/x86/sad_sse2.asm
+++ b/aom_dsp/x86/sad_sse2.asm
@@ -47,6 +47,76 @@
%endif ; %3 == 7
%endmacro
+%if CONFIG_EXT_PARTITION
+; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD128XN 1-2 0
+ SAD_FN 128, %1, 5, %2
+ mov n_rowsd, %1
+ pxor m0, m0
+
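+; Each iteration of .loop accumulates the SAD of one 128-pixel row, read as
+; eight 16-byte chunks; the _avg variants first average against the second
+; predictor.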
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+32]
+ psadbw m4, [srcq+48]
+
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+
+ movu m1, [refq+64]
+ movu m2, [refq+80]
+ movu m3, [refq+96]
+ movu m4, [refq+112]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*4]
+ pavgb m2, [second_predq+mmsize*5]
+ pavgb m3, [second_predq+mmsize*6]
+ pavgb m4, [second_predq+mmsize*7]
+ lea second_predq, [second_predq+mmsize*8]
+%endif
+ psadbw m1, [srcq+64]
+ psadbw m2, [srcq+80]
+ psadbw m3, [srcq+96]
+ psadbw m4, [srcq+112]
+
+ add refq, ref_strideq
+ add srcq, src_strideq
+
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+
+ sub n_rowsd, 1
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD128XN 128 ; sad128x128_sse2
+SAD128XN 128, 1 ; sad128x128_avg_sse2
+SAD128XN 64 ; sad128x64_sse2
+SAD128XN 64, 1 ; sad128x64_avg_sse2
+%endif
+
+
; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
@@ -85,6 +155,10 @@
%endmacro
INIT_XMM sse2
+%if CONFIG_EXT_PARTITION
+SAD64XN 128 ; sad64x128_sse2
+SAD64XN 128, 1 ; sad64x128_avg_sse2
+%endif
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
diff --git a/aom_dsp/x86/subtract_sse2.asm b/aom_dsp/x86/subtract_sse2.asm
index 2fc5fae..7bd5b23 100644
--- a/aom_dsp/x86/subtract_sse2.asm
+++ b/aom_dsp/x86/subtract_sse2.asm
@@ -34,6 +34,10 @@
je .case_16
cmp colsd, 32
je .case_32
+%if CONFIG_EXT_PARTITION
+ cmp colsd, 64
+ je .case_64
+%endif
%macro loop16 6
mova m0, [srcq+%1]
@@ -58,6 +62,22 @@
mova [diffq+mmsize*1+%6], m1
%endmacro
+%if CONFIG_EXT_PARTITION
+ mov pred_str, pred_stridemp
+.loop_128:
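+ ; each iteration subtracts one 128-pixel row, handled as four 32-pixel
+ ; loop16 groups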
+ loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+ loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+ loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize, 8*mmsize, 10*mmsize
+ loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ sub rowsd, 1
+ jnz .loop_128
+ RET
+
+.case_64:
+%endif
mov pred_str, pred_stridemp
.loop_64:
loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
diff --git a/aom_dsp/x86/sum_squares_sse2.c b/aom_dsp/x86/sum_squares_sse2.c
new file mode 100644
index 0000000..eb1d912
--- /dev/null
+++ b/aom_dsp/x86/sum_squares_sse2.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "./aom_dsp_rtcd.h"
+
+static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
+ int stride) {
+ const __m128i v_val_0_w =
+ _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
+ const __m128i v_val_1_w =
+ _mm_loadl_epi64((const __m128i *)(src + 1 * stride));
+ const __m128i v_val_2_w =
+ _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
+ const __m128i v_val_3_w =
+ _mm_loadl_epi64((const __m128i *)(src + 3 * stride));
+
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+
+ const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+
+ const __m128i v_sum_d =
+ _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
+
+ return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
+}
+
+#ifdef __GNUC__
+// This prevents GCC/Clang from inlining this function into
+// aom_sum_squares_2d_i16_sse2, which in turn saves some stack
+// maintenance instructions in the common case of 4x4.
+__attribute__((noinline))
+#endif
+static uint64_t
+aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int size) {
+ int r, c;
+
+ const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+ __m128i v_acc_q = _mm_setzero_si128();
+
+ for (r = 0; r < size; r += 8) {
+ __m128i v_acc_d = _mm_setzero_si128();
+
+ for (c = 0; c < size; c += 8) {
+ const int16_t *b = src + c;
+
+ const __m128i v_val_0_w =
+ _mm_load_si128((const __m128i *)(b + 0 * stride));
+ const __m128i v_val_1_w =
+ _mm_load_si128((const __m128i *)(b + 1 * stride));
+ const __m128i v_val_2_w =
+ _mm_load_si128((const __m128i *)(b + 2 * stride));
+ const __m128i v_val_3_w =
+ _mm_load_si128((const __m128i *)(b + 3 * stride));
+ const __m128i v_val_4_w =
+ _mm_load_si128((const __m128i *)(b + 4 * stride));
+ const __m128i v_val_5_w =
+ _mm_load_si128((const __m128i *)(b + 5 * stride));
+ const __m128i v_val_6_w =
+ _mm_load_si128((const __m128i *)(b + 6 * stride));
+ const __m128i v_val_7_w =
+ _mm_load_si128((const __m128i *)(b + 7 * stride));
+
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+ const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
+ const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
+ const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
+ const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
+
+ const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
+ const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
+
+ const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+ const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
+
+ v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
+ v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
+ }
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
+
+ src += 8 * stride;
+ }
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+
+#if ARCH_X86_64
+ return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
+#else
+ {
+ uint64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
+ return tmp;
+ }
+#endif
+}
+
+uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) {
+ // Four elements per row only require half an XMM register, so this has to
+ // be handled as a special case; note also that over 75% of all calls use
+ // size == 4, so it is also the common case.
+ if (LIKELY(size == 4)) {
+ return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
+ } else {
+ // Generic case
+ return aom_sum_squares_2d_i16_nxn_sse2(src, stride, size);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// 1D version
+//////////////////////////////////////////////////////////////////////////////
+
+static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
+ const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+ __m128i v_acc0_q = _mm_setzero_si128();
+ __m128i v_acc1_q = _mm_setzero_si128();
+
+ const int16_t *const end = src + n;
+
+ assert(n % 64 == 0);
+
+ while (src < end) {
+ const __m128i v_val_0_w = xx_load_128(src);
+ const __m128i v_val_1_w = xx_load_128(src + 8);
+ const __m128i v_val_2_w = xx_load_128(src + 16);
+ const __m128i v_val_3_w = xx_load_128(src + 24);
+ const __m128i v_val_4_w = xx_load_128(src + 32);
+ const __m128i v_val_5_w = xx_load_128(src + 40);
+ const __m128i v_val_6_w = xx_load_128(src + 48);
+ const __m128i v_val_7_w = xx_load_128(src + 56);
+
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+ const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
+ const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
+ const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
+ const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
+
+ const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
+ const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
+
+ const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+ const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
+
+ const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d);
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q));
+ v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32));
+
+ src += 64;
+ }
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
+ v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
+
+#if ARCH_X86_64
+ return (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
+#else
+ {
+ uint64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, v_acc0_q);
+ return tmp;
+ }
+#endif
+}
+
+uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
+ if (n % 64 == 0) {
+ return aom_sum_squares_i16_64n_sse2(src, n);
+ } else if (n > 64) {
+ int k = n & ~(64 - 1);
+ return aom_sum_squares_i16_64n_sse2(src, k) +
+ aom_sum_squares_i16_c(src + k, n - k);
+ } else {
+ return aom_sum_squares_i16_c(src, n);
+ }
+}
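As the comments above note, the 4x4 case is both special (it only fills half an XMM register) and dominant, hence the dedicated small kernel and the noinline hint on the generic path. The quantity every kernel in this file computes is simply a sum of squared int16 samples; a scalar sketch (illustrative only, the real C fallbacks live elsewhere in the tree) is:

#include <stdint.h>

/* Scalar sketch of the 2D sum of squares accelerated above (illustrative). */
static uint64_t sum_squares_2d_i16_ref(const int16_t *src, int stride,
                                       int size) {
  uint64_t ss = 0;
  int r, c;
  for (r = 0; r < size; ++r)
    for (c = 0; c < size; ++c)
      ss += (int64_t)src[r * stride + c] * src[r * stride + c];
  return ss;
}

The 1D wrapper splits a length that is not a multiple of 64 the same way: for n = 100 it runs the SSE2 loop over the first 64 samples and hands the remaining 36 to aom_sum_squares_i16_c.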
diff --git a/aom_dsp/x86/synonyms.h b/aom_dsp/x86/synonyms.h
index da893a6..bef606d 100644
--- a/aom_dsp/x86/synonyms.h
+++ b/aom_dsp/x86/synonyms.h
@@ -68,13 +68,21 @@
}
static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
- const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1));
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
return _mm_srli_epi32(v_tmp_d, bits);
}
+// This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
- const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1));
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
const __m128i v_tmp_d =
_mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
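The helpers above are vector forms of round-to-nearest division by a power of two; a scalar sketch of the unsigned case (ROUND_POWER_OF_TWO) is:

/* Scalar sketch of ROUND_POWER_OF_TWO as used by the helpers above:
 * divide by 2^bits, rounding halves up. */
static unsigned int round_power_of_two(unsigned int value, int bits) {
  return (value + ((1u << bits) >> 1)) >> bits;
}
/* Example: round_power_of_two(23, 4) == 1 (23/16 = 1.44 rounds down), and
 * round_power_of_two(24, 4) == 2 (24/16 = 1.5 rounds up). */

Writing the bias as (1 << bits) >> 1 rather than 1 << (bits - 1), as this change does, keeps bits == 0 well defined (the bias becomes 0 instead of requiring a shift by -1). The signed variant additionally adds the sign mask so that negative values round with the same magnitude as positive ones.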
diff --git a/aom_dsp/x86/txfm_common_avx2.h b/aom_dsp/x86/txfm_common_avx2.h
new file mode 100644
index 0000000..39e9b8e
--- /dev/null
+++ b/aom_dsp/x86/txfm_common_avx2.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_TXFM_COMMON_AVX2_H
+#define AOM_DSP_X86_TXFM_COMMON_AVX2_H
+
+#include <immintrin.h>
+
+#include "aom_dsp/txfm_common.h"
+
+#define pair256_set_epi16(a, b) \
+ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
+
+#define pair256_set_epi32(a, b) \
+ _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), (int)(b), (int)(a), \
+ (int)(b), (int)(a))
+
+static INLINE void mm256_reverse_epi16(__m256i *u) {
+ const __m256i control = _mm256_set_epi16(
+ 0x0100, 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E, 0x0100,
+ 0x0302, 0x0504, 0x0706, 0x0908, 0x0B0A, 0x0D0C, 0x0F0E);
+ __m256i v = _mm256_shuffle_epi8(*u, control);
+ *u = _mm256_permute2x128_si256(v, v, 1);
+}
+
+static INLINE void mm256_transpose_16x16(__m256i *in) {
+ __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+ __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
+ __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
+ __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+ __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+ __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
+ __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
+ __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+ __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
+ __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
+ __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
+ __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
+ __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
+ __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
+ __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
+ __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
+
+ // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
+ // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
+ // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
+ // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
+ // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
+ // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
+ // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
+ // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
+
+ // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
+ // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
+ // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
+ // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
+ // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
+ // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
+ // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
+ // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
+
+ __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
+ __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
+ __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
+ __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
+ __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
+ __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
+ __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
+ __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
+
+ __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
+ __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
+ __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
+ __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
+ __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
+ __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
+ __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
+ __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
+
+ // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
+ // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
+ // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
+ // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
+ // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
+ // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
+ // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
+ // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
+
+ // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
+ // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
+ // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
+ // 86 96 a6 b6 87 97 a7 b7 8e 9e ae be 8f 9f af bf
+ // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
+ // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
+ // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ec fc cd dd ed fd
+ // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
+
+ tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+ tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+ tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+ tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+ tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+ tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+ tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+ tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+
+ tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
+ tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
+ tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
+ tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
+ tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
+ tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
+ tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
+ tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
+
+ // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
+ // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
+ // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
+ // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
+ // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
+ // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
+ // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
+ // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
+
+ // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
+ // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
+ // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
+ // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
+ // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ec fc
+ // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
+ // 86 96 a6 b6 c6 d6 e6 f6 8e 9e ae be ce de ee fe
+ // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
+
+ in[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
+ in[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
+ in[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
+ in[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
+ in[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
+ in[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
+ in[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
+ in[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
+
+ in[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
+ in[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
+ in[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
+ in[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
+ in[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
+ in[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
+ in[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
+ in[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
+}
+
+static INLINE __m256i butter_fly(__m256i a0, __m256i a1, const __m256i cospi) {
+ const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ __m256i y0 = _mm256_madd_epi16(a0, cospi);
+ __m256i y1 = _mm256_madd_epi16(a1, cospi);
+
+ y0 = _mm256_add_epi32(y0, dct_rounding);
+ y1 = _mm256_add_epi32(y1, dct_rounding);
+ y0 = _mm256_srai_epi32(y0, DCT_CONST_BITS);
+ y1 = _mm256_srai_epi32(y1, DCT_CONST_BITS);
+
+ return _mm256_packs_epi32(y0, y1);
+}
+
+static INLINE void txfm_scaling16_avx2(const int16_t c, __m256i *in) {
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i sqrt2_epi16 = _mm256_set1_epi16(c);
+ const __m256i dct_const_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ __m256i u0, u1;
+ int i = 0;
+
+ while (i < 16) {
+ in[i] = _mm256_slli_epi16(in[i], 1);
+
+ u0 = _mm256_unpacklo_epi16(zero, in[i]);
+ u1 = _mm256_unpackhi_epi16(zero, in[i]);
+
+ u0 = _mm256_madd_epi16(u0, sqrt2_epi16);
+ u1 = _mm256_madd_epi16(u1, sqrt2_epi16);
+
+ u0 = _mm256_add_epi32(u0, dct_const_rounding);
+ u1 = _mm256_add_epi32(u1, dct_const_rounding);
+
+ u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
+ u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
+ in[i] = _mm256_packs_epi32(u0, u1);
+ i++;
+ }
+}
+
+#endif // AOM_DSP_X86_TXFM_COMMON_AVX2_H
diff --git a/aom_dsp/x86/txfm_common_intrin.h b/aom_dsp/x86/txfm_common_intrin.h
new file mode 100644
index 0000000..890e048
--- /dev/null
+++ b/aom_dsp/x86/txfm_common_intrin.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
+#define _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
+
+// Note:
+// This header file should be included after any x86 intrinsics header files.
+
+static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_storeu_si128((__m128i *)(dst_ptr), out0);
+ _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
+#else
+ _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
+#endif // CONFIG_AOM_HIGHBITDEPTH
+}
+
+#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
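In scalar terms, storeu_output() above writes one group of eight 16-bit coefficients to the tran_low_t output array; when CONFIG_AOM_HIGHBITDEPTH is enabled the destination type is wider, so each lane is sign-extended first (which is what the unpack against the sign mask does). A rough sketch, assuming tran_low_t widens to 32 bits in that configuration and that the txfm_common headers are included:

/* Scalar sketch of storeu_output(): write 8 coefficients, letting the
 * assignment sign-extend when tran_low_t is wider than 16 bits. */
static void storeu_output_ref(const int16_t *coeffs, tran_low_t *dst) {
  int i;
  for (i = 0; i < 8; ++i) dst[i] = (tran_low_t)coeffs[i];
}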
diff --git a/aom_dsp/x86/txfm_common_sse2.h b/aom_dsp/x86/txfm_common_sse2.h
index 027bd48..ae57361 100644
--- a/aom_dsp/x86/txfm_common_sse2.h
+++ b/aom_dsp/x86/txfm_common_sse2.h
@@ -27,4 +27,11 @@
_mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
(int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
+// Reverse the order of the 8 16-bit words in an __m128i.
+static INLINE __m128i mm_reverse_epi16(const __m128i x) {
+ const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
+ const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
+ return _mm_shuffle_epi32(b, 0x4e);
+}
+
#endif // AOM_DSP_X86_TXFM_COMMON_SSE2_H_
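A quick usage sketch of the new mm_reverse_epi16() helper (the values are illustrative):

#include <emmintrin.h>
/* Assumes txfm_common_sse2.h is included for mm_reverse_epi16(). */
static __m128i reverse_example(void) {
  /* Lanes, low to high: 0 1 2 3 4 5 6 7 ... */
  const __m128i in = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
  /* ... come back as 7 6 5 4 3 2 1 0. */
  return mm_reverse_epi16(in);
}

Internally it reverses each 64-bit half with shufflelo/shufflehi and then swaps the halves with a dword shuffle, so it needs only SSE2 (no pshufb, and therefore no SSSE3 requirement).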
diff --git a/aom_dsp/x86/variance_avx2.c b/aom_dsp/x86/variance_avx2.c
index c0b78fa..18a70df 100644
--- a/aom_dsp/x86/variance_avx2.c
+++ b/aom_dsp/x86/variance_avx2.c
@@ -8,6 +8,8 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+
+#include <immintrin.h>
#include "./aom_dsp_rtcd.h"
typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
@@ -43,9 +45,13 @@
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
+ unsigned int variance;
variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
aom_get16x16var_avx2, 16);
- return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
+
+ variance = *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
+ _mm256_zeroupper();
+ return variance;
}
unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
@@ -53,6 +59,7 @@
unsigned int *sse) {
int sum;
aom_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
+ _mm256_zeroupper();
return *sse;
}
@@ -60,36 +67,52 @@
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
+ unsigned int variance;
variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
aom_get32x32var_avx2, 32);
- return *sse - (((int64_t)sum * sum) >> 9);
+
+ variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
+ _mm256_zeroupper();
+ return variance;
}
unsigned int aom_variance32x32_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
+ unsigned int variance;
variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
aom_get32x32var_avx2, 32);
- return *sse - (((int64_t)sum * sum) >> 10);
+
+ variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
+ _mm256_zeroupper();
+ return variance;
}
unsigned int aom_variance64x64_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
+ unsigned int variance;
variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
aom_get32x32var_avx2, 32);
- return *sse - (((int64_t)sum * sum) >> 12);
+
+ variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
+ _mm256_zeroupper();
+ return variance;
}
unsigned int aom_variance64x32_avx2(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
unsigned int *sse) {
int sum;
+ unsigned int variance;
variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
aom_get32x32var_avx2, 32);
- return *sse - (((int64_t)sum * sum) >> 11);
+
+ variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
+ _mm256_zeroupper();
+ return variance;
}
unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
@@ -115,8 +138,12 @@
aom_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset,
dst + 32, dst_stride, 64, &sse2);
const int se = se1 + se2;
+ unsigned int variance;
*sse = sse1 + sse2;
- return *sse - (((int64_t)se * se) >> 12);
+
+ variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
+ _mm256_zeroupper();
+ return variance;
}
unsigned int aom_sub_pixel_variance32x32_avx2(const uint8_t *src,
@@ -126,7 +153,10 @@
unsigned int *sse) {
const int se = aom_sub_pixel_variance32xh_avx2(
src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
- return *sse - (((int64_t)se * se) >> 10);
+
+ const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
+ _mm256_zeroupper();
+ return variance;
}
unsigned int aom_sub_pixel_avg_variance64x64_avx2(
@@ -140,10 +170,13 @@
src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32,
64, 64, &sse2);
const int se = se1 + se2;
+ unsigned int variance;
*sse = sse1 + sse2;
- return *sse - (((int64_t)se * se) >> 12);
+ variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
+ _mm256_zeroupper();
+ return variance;
}
unsigned int aom_sub_pixel_avg_variance32x32_avx2(
@@ -152,5 +185,8 @@
// Process 32 elements in parallel.
const int se = aom_sub_pixel_avg_variance32xh_avx2(
src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);
- return *sse - (((int64_t)se * se) >> 10);
+
+ const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
+ _mm256_zeroupper();
+ return variance;
}
diff --git a/aom_dsp/x86/variance_impl_avx2.c b/aom_dsp/x86/variance_impl_avx2.c
index ebded7e..999b541 100644
--- a/aom_dsp/x86/variance_impl_avx2.c
+++ b/aom_dsp/x86/variance_impl_avx2.c
@@ -139,6 +139,7 @@
*((int *)Sum) = _mm_cvtsi128_si32(sum_res);
}
+ _mm256_zeroupper();
}
void aom_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
@@ -228,6 +229,7 @@
*((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
_mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
}
+ _mm256_zeroupper();
}
#define FILTER_SRC(filter) \
@@ -482,6 +484,7 @@
}
}
CALC_SUM_AND_SSE
+ _mm256_zeroupper();
return sum;
}
@@ -705,5 +708,6 @@
}
}
CALC_SUM_AND_SSE
+ _mm256_zeroupper();
return sum;
}
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index 75e9719..d9563aa 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -249,7 +249,7 @@
aom_get16x16var_sse2, 16);
assert(sum <= 255 * 32 * 32);
assert(sum >= -255 * 32 * 32);
- return *sse - (((int64_t)sum * sum) >> 10);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}
unsigned int aom_variance32x16_sse2(const uint8_t *src, int src_stride,
@@ -260,7 +260,7 @@
aom_get16x16var_sse2, 16);
assert(sum <= 255 * 32 * 16);
assert(sum >= -255 * 32 * 16);
- return *sse - (((int64_t)sum * sum) >> 9);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
}
unsigned int aom_variance16x32_sse2(const uint8_t *src, int src_stride,
@@ -271,7 +271,7 @@
aom_get16x16var_sse2, 16);
assert(sum <= 255 * 32 * 16);
assert(sum >= -255 * 32 * 16);
- return *sse - (((int64_t)sum * sum) >> 9);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
}
unsigned int aom_variance64x64_sse2(const uint8_t *src, int src_stride,
@@ -282,7 +282,7 @@
aom_get16x16var_sse2, 16);
assert(sum <= 255 * 64 * 64);
assert(sum >= -255 * 64 * 64);
- return *sse - (((int64_t)sum * sum) >> 12);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
}
unsigned int aom_variance64x32_sse2(const uint8_t *src, int src_stride,
@@ -293,7 +293,7 @@
aom_get16x16var_sse2, 16);
assert(sum <= 255 * 64 * 32);
assert(sum >= -255 * 64 * 32);
- return *sse - (((int64_t)sum * sum) >> 11);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}
unsigned int aom_variance32x64_sse2(const uint8_t *src, int src_stride,
@@ -304,7 +304,7 @@
aom_get16x16var_sse2, 16);
assert(sum <= 255 * 64 * 32);
assert(sum >= -255 * 64 * 32);
- return *sse - (((int64_t)sum * sum) >> 11);
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}
unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
@@ -381,7 +381,7 @@
} \
} \
*sse_ptr = sse; \
- return sse - (cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
@@ -452,7 +452,7 @@
} \
} \
*sseptr = sse; \
- return sse - (cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
}
#define FNS(opt1, opt2) \
@@ -476,10 +476,10 @@
#undef FNS
#undef FN
-void aom_upsampled_pred_sse2(uint8_t *pred, int width, int height,
- const uint8_t *ref, const int ref_stride) {
- const int stride = ref_stride << 3;
+void aom_upsampled_pred_sse2(uint8_t *comp_pred, int width, int height,
+ const uint8_t *ref, int ref_stride) {
int i, j;
+ int stride = ref_stride << 3;
if (width >= 16) {
// read 16 points at one time
@@ -512,8 +512,8 @@
s4 = _mm_unpacklo_epi32(s4, s6);
s0 = _mm_unpacklo_epi64(s0, s4);
- _mm_storeu_si128((__m128i *)(pred), s0);
- pred += 16;
+ _mm_storeu_si128((__m128i *)(comp_pred), s0);
+ comp_pred += 16;
ref += 16 * 8;
}
ref += stride - (width << 3);
@@ -537,8 +537,8 @@
s2 = _mm_unpacklo_epi8(t1, s3);
s0 = _mm_unpacklo_epi32(s0, s2);
- _mm_storel_epi64((__m128i *)(pred), s0);
- pred += 8;
+ _mm_storel_epi64((__m128i *)(comp_pred), s0);
+ comp_pred += 8;
ref += 8 * 8;
}
ref += stride - (width << 3);
@@ -555,9 +555,8 @@
s1 = _mm_unpackhi_epi8(s0, s1);
s0 = _mm_unpacklo_epi8(t0, s1);
- *(int *)pred = _mm_cvtsi128_si32(s0);
-
- pred += 4;
+ *(int *)comp_pred = _mm_cvtsi128_si32(s0);
+ comp_pred += 4;
ref += 4 * 8;
}
ref += stride - (width << 3);
@@ -567,11 +566,11 @@
void aom_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred,
int width, int height, const uint8_t *ref,
- const int ref_stride) {
+ int ref_stride) {
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi16(1);
- const int stride = ref_stride << 3;
int i, j;
+ int stride = ref_stride << 3;
if (width >= 16) {
// read 16 points at one time
diff --git a/aom_ports/mem_ops.h b/aom_ports/mem_ops.h
index c87873f..6212675 100644
--- a/aom_ports/mem_ops.h
+++ b/aom_ports/mem_ops.h
@@ -90,7 +90,7 @@
unsigned MEM_VALUE_T val;
const MAU_T *mem = (const MAU_T *)vmem;
- val = mem[0] << 24;
+ val = ((unsigned MEM_VALUE_T)mem[0]) << 24;
val |= mem[1] << 16;
val |= mem[2] << 8;
val |= mem[3];
@@ -126,7 +126,7 @@
unsigned MEM_VALUE_T val;
const MAU_T *mem = (const MAU_T *)vmem;
- val = mem[3] << 24;
+ val = ((unsigned MEM_VALUE_T)mem[3]) << 24;
val |= mem[2] << 16;
val |= mem[1] << 8;
val |= mem[0];
@@ -140,88 +140,90 @@
return (val << (MEM_VALUE_T_SZ_BITS - sz)) >> (MEM_VALUE_T_SZ_BITS - sz); \
}
-#undef mem_get_sbe16
+/* clang-format off */
+#undef mem_get_sbe16
#define mem_get_sbe16 mem_ops_wrap_symbol(mem_get_sbe16)
mem_get_s_generic(be, 16)
-#undef mem_get_sbe24
+
+#undef mem_get_sbe24
#define mem_get_sbe24 mem_ops_wrap_symbol(mem_get_sbe24)
- mem_get_s_generic(be, 24)
+mem_get_s_generic(be, 24)
-#undef mem_get_sbe32
+#undef mem_get_sbe32
#define mem_get_sbe32 mem_ops_wrap_symbol(mem_get_sbe32)
- mem_get_s_generic(be, 32)
+mem_get_s_generic(be, 32)
-#undef mem_get_sle16
+#undef mem_get_sle16
#define mem_get_sle16 mem_ops_wrap_symbol(mem_get_sle16)
- mem_get_s_generic(le, 16)
+mem_get_s_generic(le, 16)
-#undef mem_get_sle24
+#undef mem_get_sle24
#define mem_get_sle24 mem_ops_wrap_symbol(mem_get_sle24)
- mem_get_s_generic(le, 24)
+mem_get_s_generic(le, 24)
-#undef mem_get_sle32
+#undef mem_get_sle32
#define mem_get_sle32 mem_ops_wrap_symbol(mem_get_sle32)
- mem_get_s_generic(le, 32)
+mem_get_s_generic(le, 32)
-#undef mem_put_be16
+#undef mem_put_be16
#define mem_put_be16 mem_ops_wrap_symbol(mem_put_be16)
- static AOM_INLINE
- void mem_put_be16(void *vmem, MEM_VALUE_T val) {
+static AOM_INLINE void mem_put_be16(void *vmem, MEM_VALUE_T val) {
MAU_T *mem = (MAU_T *)vmem;
- mem[0] = (val >> 8) & 0xff;
- mem[1] = (val >> 0) & 0xff;
+ mem[0] = (MAU_T)((val >> 8) & 0xff);
+ mem[1] = (MAU_T)((val >> 0) & 0xff);
}
-#undef mem_put_be24
+#undef mem_put_be24
#define mem_put_be24 mem_ops_wrap_symbol(mem_put_be24)
static AOM_INLINE void mem_put_be24(void *vmem, MEM_VALUE_T val) {
MAU_T *mem = (MAU_T *)vmem;
- mem[0] = (val >> 16) & 0xff;
- mem[1] = (val >> 8) & 0xff;
- mem[2] = (val >> 0) & 0xff;
+ mem[0] = (MAU_T)((val >> 16) & 0xff);
+ mem[1] = (MAU_T)((val >> 8) & 0xff);
+ mem[2] = (MAU_T)((val >> 0) & 0xff);
}
-#undef mem_put_be32
+#undef mem_put_be32
#define mem_put_be32 mem_ops_wrap_symbol(mem_put_be32)
static AOM_INLINE void mem_put_be32(void *vmem, MEM_VALUE_T val) {
MAU_T *mem = (MAU_T *)vmem;
- mem[0] = (val >> 24) & 0xff;
- mem[1] = (val >> 16) & 0xff;
- mem[2] = (val >> 8) & 0xff;
- mem[3] = (val >> 0) & 0xff;
+ mem[0] = (MAU_T)((val >> 24) & 0xff);
+ mem[1] = (MAU_T)((val >> 16) & 0xff);
+ mem[2] = (MAU_T)((val >> 8) & 0xff);
+ mem[3] = (MAU_T)((val >> 0) & 0xff);
}
-#undef mem_put_le16
+#undef mem_put_le16
#define mem_put_le16 mem_ops_wrap_symbol(mem_put_le16)
static AOM_INLINE void mem_put_le16(void *vmem, MEM_VALUE_T val) {
MAU_T *mem = (MAU_T *)vmem;
- mem[0] = (val >> 0) & 0xff;
- mem[1] = (val >> 8) & 0xff;
+ mem[0] = (MAU_T)((val >> 0) & 0xff);
+ mem[1] = (MAU_T)((val >> 8) & 0xff);
}
-#undef mem_put_le24
+#undef mem_put_le24
#define mem_put_le24 mem_ops_wrap_symbol(mem_put_le24)
static AOM_INLINE void mem_put_le24(void *vmem, MEM_VALUE_T val) {
MAU_T *mem = (MAU_T *)vmem;
- mem[0] = (val >> 0) & 0xff;
- mem[1] = (val >> 8) & 0xff;
- mem[2] = (val >> 16) & 0xff;
+ mem[0] = (MAU_T)((val >> 0) & 0xff);
+ mem[1] = (MAU_T)((val >> 8) & 0xff);
+ mem[2] = (MAU_T)((val >> 16) & 0xff);
}
-#undef mem_put_le32
+#undef mem_put_le32
#define mem_put_le32 mem_ops_wrap_symbol(mem_put_le32)
static AOM_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) {
MAU_T *mem = (MAU_T *)vmem;
- mem[0] = (val >> 0) & 0xff;
- mem[1] = (val >> 8) & 0xff;
- mem[2] = (val >> 16) & 0xff;
- mem[3] = (val >> 24) & 0xff;
+ mem[0] = (MAU_T)((val >> 0) & 0xff);
+ mem[1] = (MAU_T)((val >> 8) & 0xff);
+ mem[2] = (MAU_T)((val >> 16) & 0xff);
+ mem[3] = (MAU_T)((val >> 24) & 0xff);
}
+/* clang-format on */
#endif // AOM_PORTS_MEM_OPS_H_
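The put helpers above serialize values one byte at a time, so the result is independent of host endianness, and the new (MAU_T) casts simply make the intentional narrowing explicit to quiet compiler truncation warnings. A small usage sketch (assumes mem_ops.h is included):

/* Byte layout written by mem_put_le32(). */
static void mem_put_le32_example(void) {
  unsigned char buf[4];
  mem_put_le32(buf, 0x12345678);
  /* buf[] now holds { 0x78, 0x56, 0x34, 0x12 } on any host. */
}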
diff --git a/aom_ports/mem_ops_aligned.h b/aom_ports/mem_ops_aligned.h
index 6dcf321..8c3ab1c 100644
--- a/aom_ports/mem_ops_aligned.h
+++ b/aom_ports/mem_ops_aligned.h
@@ -28,9 +28,9 @@
/* Architectures that provide instructions for doing this byte swapping
* could redefine these macros.
*/
-#define swap_endian_16(val, raw) \
- do { \
- val = ((raw >> 8) & 0x00ff) | ((raw << 8) & 0xff00); \
+#define swap_endian_16(val, raw) \
+ do { \
+ val = (uint16_t)(((raw >> 8) & 0x00ff) | ((raw << 8) & 0xff00)); \
} while (0)
#define swap_endian_32(val, raw) \
do { \
@@ -108,53 +108,54 @@
#define mem_put_le_aligned_generic(sz) mem_put_ne_aligned_generic(le, sz)
#endif
-#undef mem_get_be16_aligned
+/* clang-format off */
+#undef mem_get_be16_aligned
#define mem_get_be16_aligned mem_ops_wrap_symbol(mem_get_be16_aligned)
mem_get_be_aligned_generic(16)
-#undef mem_get_be32_aligned
+#undef mem_get_be32_aligned
#define mem_get_be32_aligned mem_ops_wrap_symbol(mem_get_be32_aligned)
- mem_get_be_aligned_generic(32)
+mem_get_be_aligned_generic(32)
-#undef mem_get_le16_aligned
+#undef mem_get_le16_aligned
#define mem_get_le16_aligned mem_ops_wrap_symbol(mem_get_le16_aligned)
- mem_get_le_aligned_generic(16)
+mem_get_le_aligned_generic(16)
-#undef mem_get_le32_aligned
+#undef mem_get_le32_aligned
#define mem_get_le32_aligned mem_ops_wrap_symbol(mem_get_le32_aligned)
- mem_get_le_aligned_generic(32)
+mem_get_le_aligned_generic(32)
-#undef mem_get_sbe16_aligned
+#undef mem_get_sbe16_aligned
#define mem_get_sbe16_aligned mem_ops_wrap_symbol(mem_get_sbe16_aligned)
- mem_get_sbe_aligned_generic(16)
+mem_get_sbe_aligned_generic(16)
-#undef mem_get_sbe32_aligned
+#undef mem_get_sbe32_aligned
#define mem_get_sbe32_aligned mem_ops_wrap_symbol(mem_get_sbe32_aligned)
- mem_get_sbe_aligned_generic(32)
+mem_get_sbe_aligned_generic(32)
-#undef mem_get_sle16_aligned
+#undef mem_get_sle16_aligned
#define mem_get_sle16_aligned mem_ops_wrap_symbol(mem_get_sle16_aligned)
- mem_get_sle_aligned_generic(16)
+mem_get_sle_aligned_generic(16)
-#undef mem_get_sle32_aligned
+#undef mem_get_sle32_aligned
#define mem_get_sle32_aligned mem_ops_wrap_symbol(mem_get_sle32_aligned)
- mem_get_sle_aligned_generic(32)
+mem_get_sle_aligned_generic(32)
-#undef mem_put_be16_aligned
+#undef mem_put_be16_aligned
#define mem_put_be16_aligned mem_ops_wrap_symbol(mem_put_be16_aligned)
- mem_put_be_aligned_generic(16)
+mem_put_be_aligned_generic(16)
-#undef mem_put_be32_aligned
+#undef mem_put_be32_aligned
#define mem_put_be32_aligned mem_ops_wrap_symbol(mem_put_be32_aligned)
- mem_put_be_aligned_generic(32)
+mem_put_be_aligned_generic(32)
-#undef mem_put_le16_aligned
+#undef mem_put_le16_aligned
#define mem_put_le16_aligned mem_ops_wrap_symbol(mem_put_le16_aligned)
- mem_put_le_aligned_generic(16)
+mem_put_le_aligned_generic(16)
-#undef mem_put_le32_aligned
+#undef mem_put_le32_aligned
#define mem_put_le32_aligned mem_ops_wrap_symbol(mem_put_le32_aligned)
- mem_put_le_aligned_generic(32)
+mem_put_le_aligned_generic(32)
#undef mem_get_ne_aligned_generic
#undef mem_get_se_aligned_generic
@@ -166,5 +167,6 @@
#undef swap_endian_32
#undef swap_endian_16_se
#undef swap_endian_32_se
+/* clang-format on */
#endif // AOM_PORTS_MEM_OPS_ALIGNED_H_
diff --git a/aom_ports/x86.h b/aom_ports/x86.h
index 7cad83f..e5680ca 100644
--- a/aom_ports/x86.h
+++ b/aom_ports/x86.h
@@ -12,6 +12,11 @@
#ifndef AOM_PORTS_X86_H_
#define AOM_PORTS_X86_H_
#include <stdlib.h>
+
+#if defined(_MSC_VER)
+#include <intrin.h> /* For __cpuidex, __rdtsc */
+#endif
+
#include "aom_config.h"
#include "aom/aom_integer.h"
@@ -77,8 +82,6 @@
#else /* end __SUNPRO__ */
#if ARCH_X86_64
#if defined(_MSC_VER) && _MSC_VER > 1500
-void __cpuidex(int CPUInfo[4], int info_type, int ecxvalue);
-#pragma intrinsic(__cpuidex)
#define cpuid(func, func2, a, b, c, d) \
do { \
int regs[4]; \
@@ -89,8 +92,6 @@
d = regs[3]; \
} while (0)
#else
-void __cpuid(int CPUInfo[4], int info_type);
-#pragma intrinsic(__cpuid)
#define cpuid(func, func2, a, b, c, d) \
do { \
int regs[4]; \
@@ -103,13 +104,13 @@
#endif
#else
/* clang-format off */
-#define cpuid(func, func2, a, b, c, d)\
- __asm mov eax, func\
- __asm mov ecx, func2\
- __asm cpuid\
- __asm mov a, eax\
- __asm mov b, ebx\
- __asm mov c, ecx\
+#define cpuid(func, func2, a, b, c, d) \
+ __asm mov eax, func \
+ __asm mov ecx, func2 \
+ __asm cpuid \
+ __asm mov a, eax \
+ __asm mov b, ebx \
+ __asm mov c, ecx \
__asm mov d, edx
#endif
/* clang-format on */
@@ -179,7 +180,7 @@
env = getenv("AOM_SIMD_CAPS_MASK");
- if (env && *env) mask = strtol(env, NULL, 0);
+ if (env && *env) mask = (unsigned int)strtoul(env, NULL, 0);
/* Ensure that the CPUID instruction supports extended features */
cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
@@ -218,10 +219,11 @@
return flags & mask;
}
-#if ARCH_X86_64 && defined(_MSC_VER)
-unsigned __int64 __rdtsc(void);
-#pragma intrinsic(__rdtsc)
-#endif
+// Note:
+// The 32-bit CPU cycle counter is lightweight and adequate for most function
+// performance measurements. For long-running functions (CPU time of more
+// than a couple of seconds), the 64-bit counter should be used.
+// 32-bit CPU cycle counter
static INLINE unsigned int x86_readtsc(void) {
#if defined(__GNUC__) && __GNUC__
unsigned int tsc;
@@ -239,6 +241,24 @@
#endif
#endif
}
+// 64-bit CPU cycle counter
+static INLINE uint64_t x86_readtsc64(void) {
+#if defined(__GNUC__) && __GNUC__
+ uint32_t hi, lo;
+ __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
+ return ((uint64_t)hi << 32) | lo;
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+ uint_t hi, lo;
+ asm volatile("rdtsc\n\t" : "=a"(lo), "=d"(hi));
+ return ((uint64_t)hi << 32) | lo;
+#else
+#if ARCH_X86_64
+ return (uint64_t)__rdtsc();
+#else
+ __asm rdtsc;
+#endif
+#endif
+}
#if defined(__GNUC__) && __GNUC__
#define x86_pause_hint() __asm__ __volatile__("pause \n\t")
diff --git a/aom_scale/aom_scale_rtcd.pl b/aom_scale/aom_scale_rtcd.pl
index fd6b577..925530a 100644
--- a/aom_scale/aom_scale_rtcd.pl
+++ b/aom_scale/aom_scale_rtcd.pl
@@ -8,12 +8,12 @@
# Scaler functions
if (aom_config("CONFIG_SPATIAL_RESAMPLING") eq "yes") {
add_proto qw/void aom_horizontal_line_5_4_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width";
- add_proto qw/void aom_vertical_band_5_4_scale/, "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width";
+ add_proto qw/void aom_vertical_band_5_4_scale/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width";
add_proto qw/void aom_horizontal_line_5_3_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width";
- add_proto qw/void aom_vertical_band_5_3_scale/, "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width";
+ add_proto qw/void aom_vertical_band_5_3_scale/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width";
add_proto qw/void aom_horizontal_line_2_1_scale/, "const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width";
- add_proto qw/void aom_vertical_band_2_1_scale/, "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width";
- add_proto qw/void aom_vertical_band_2_1_scale_i/, "unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width";
+ add_proto qw/void aom_vertical_band_2_1_scale/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width";
+ add_proto qw/void aom_vertical_band_2_1_scale_i/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width";
}
add_proto qw/void aom_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf";
diff --git a/aom_scale/generic/aom_scale.c b/aom_scale/generic/aom_scale.c
index 28604ac..14f3ae0 100644
--- a/aom_scale/generic/aom_scale.c
+++ b/aom_scale/generic/aom_scale.c
@@ -68,7 +68,6 @@
unsigned int source_scale, unsigned int source_length,
unsigned char *dest, int dest_step,
unsigned int dest_scale, unsigned int dest_length) {
- const unsigned int source_pitch = source_step;
const unsigned char *const dest_end = dest + dest_length * dest_step;
(void)source_length;
(void)source_scale;
@@ -81,9 +80,9 @@
dest += dest_step;
while (dest < dest_end) {
- const unsigned int a = 3 * source[-source_pitch];
+ const unsigned int a = 3 * source[-source_step];
const unsigned int b = 10 * source[0];
- const unsigned int c = 3 * source[source_pitch];
+ const unsigned int c = 3 * source[source_step];
*dest = (unsigned char)((8 + a + b + c) >> 4);
source += source_step;
dest += dest_step;
@@ -253,8 +252,8 @@
void (*horiz_line_scale)(const unsigned char *, unsigned int, unsigned char *,
unsigned int) = NULL;
- void (*vert_band_scale)(unsigned char *, unsigned int, unsigned char *,
- unsigned int, unsigned int) = NULL;
+ void (*vert_band_scale)(unsigned char *, int, unsigned char *, int,
+ unsigned int) = NULL;
int ratio_scalable = 1;
int interpolation = 0;
diff --git a/aom_scale/generic/gen_scalers.c b/aom_scale/generic/gen_scalers.c
index fd638bd..71fa82f 100644
--- a/aom_scale/generic/gen_scalers.c
+++ b/aom_scale/generic/gen_scalers.c
@@ -59,9 +59,8 @@
}
}
-void aom_vertical_band_5_4_scale_c(unsigned char *source,
- unsigned int src_pitch, unsigned char *dest,
- unsigned int dest_pitch,
+void aom_vertical_band_5_4_scale_c(unsigned char *source, int src_pitch,
+ unsigned char *dest, int dest_pitch,
unsigned int dest_width) {
const unsigned char *const dest_end = dest + dest_width;
while (dest < dest_end) {
@@ -124,9 +123,8 @@
}
}
-void aom_vertical_band_5_3_scale_c(unsigned char *source,
- unsigned int src_pitch, unsigned char *dest,
- unsigned int dest_pitch,
+void aom_vertical_band_5_3_scale_c(unsigned char *source, int src_pitch,
+ unsigned char *dest, int dest_pitch,
unsigned int dest_width) {
const unsigned char *const dest_end = dest + dest_width;
while (dest < dest_end) {
@@ -178,19 +176,16 @@
}
}
-void aom_vertical_band_2_1_scale_c(unsigned char *source,
- unsigned int src_pitch, unsigned char *dest,
- unsigned int dest_pitch,
+void aom_vertical_band_2_1_scale_c(unsigned char *source, int src_pitch,
+ unsigned char *dest, int dest_pitch,
unsigned int dest_width) {
(void)dest_pitch;
(void)src_pitch;
memcpy(dest, source, dest_width);
}
-void aom_vertical_band_2_1_scale_i_c(unsigned char *source,
- unsigned int src_pitch,
- unsigned char *dest,
- unsigned int dest_pitch,
+void aom_vertical_band_2_1_scale_i_c(unsigned char *source, int src_pitch,
+ unsigned char *dest, int dest_pitch,
unsigned int dest_width) {
const unsigned char *const dest_end = dest + dest_width;
(void)dest_pitch;
diff --git a/aom_scale/generic/yv12config.c b/aom_scale/generic/yv12config.c
index 8e1ac0b..ebaf957 100644
--- a/aom_scale/generic/yv12config.c
+++ b/aom_scale/generic/yv12config.c
@@ -25,95 +25,6 @@
#define yv12_align_addr(addr, align) \
(void *)(((size_t)(addr) + ((align)-1)) & (size_t) - (align))
-int aom_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
- if (ybf) {
- // If libaom is using frame buffer callbacks then buffer_alloc_sz must
- // not be set.
- if (ybf->buffer_alloc_sz > 0) {
- aom_free(ybf->buffer_alloc);
- }
-
- /* buffer_alloc isn't accessed by most functions. Rather y_buffer,
- u_buffer and v_buffer point to buffer_alloc and are used. Clear out
- all of this so that a freed pointer isn't inadvertently used */
- memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
- } else {
- return -1;
- }
-
- return 0;
-}
-
-int aom_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width,
- int height, int border) {
- if (ybf) {
- int aligned_width = (width + 15) & ~15;
- int aligned_height = (height + 15) & ~15;
- int y_stride = ((aligned_width + 2 * border) + 31) & ~31;
- int yplane_size = (aligned_height + 2 * border) * y_stride;
- int uv_width = aligned_width >> 1;
- int uv_height = aligned_height >> 1;
- /** There is currently a bunch of code which assumes
- * uv_stride == y_stride/2, so enforce this here. */
- int uv_stride = y_stride >> 1;
- int uvplane_size = (uv_height + border) * uv_stride;
- const int frame_size = yplane_size + 2 * uvplane_size;
-
- if (!ybf->buffer_alloc) {
- ybf->buffer_alloc = (uint8_t *)aom_memalign(32, frame_size);
- ybf->buffer_alloc_sz = frame_size;
- }
-
- if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size) return -1;
-
- /* Only support allocating buffers that have a border that's a multiple
- * of 32. The border restriction is required to get 16-byte alignment of
- * the start of the chroma rows without introducing an arbitrary gap
- * between planes, which would break the semantics of things like
- * aom_img_set_rect(). */
- if (border & 0x1f) return -3;
-
- ybf->y_crop_width = width;
- ybf->y_crop_height = height;
- ybf->y_width = aligned_width;
- ybf->y_height = aligned_height;
- ybf->y_stride = y_stride;
-
- ybf->uv_crop_width = (width + 1) / 2;
- ybf->uv_crop_height = (height + 1) / 2;
- ybf->uv_width = uv_width;
- ybf->uv_height = uv_height;
- ybf->uv_stride = uv_stride;
-
- ybf->alpha_width = 0;
- ybf->alpha_height = 0;
- ybf->alpha_stride = 0;
-
- ybf->border = border;
- ybf->frame_size = frame_size;
-
- ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
- ybf->u_buffer =
- ybf->buffer_alloc + yplane_size + (border / 2 * uv_stride) + border / 2;
- ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size +
- (border / 2 * uv_stride) + border / 2;
- ybf->alpha_buffer = NULL;
-
- ybf->corrupted = 0; /* assume not currupted by errors */
- return 0;
- }
- return -2;
-}
-
-int aom_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
- int border) {
- if (ybf) {
- aom_yv12_de_alloc_frame_buffer(ybf);
- return aom_yv12_realloc_frame_buffer(ybf, width, height, border);
- }
- return -2;
-}
-
#if CONFIG_AV1
// TODO(jkoleszar): Maybe replace this with struct aom_image
@@ -199,7 +110,7 @@
ybf->buffer_alloc = (uint8_t *)aom_memalign(32, (size_t)frame_size);
if (!ybf->buffer_alloc) return -1;
- ybf->buffer_alloc_sz = (int)frame_size;
+ ybf->buffer_alloc_sz = (size_t)frame_size;
// This memset is needed for fixing valgrind error from C loop filter
// due to access uninitialized memory in frame border. It could be
@@ -227,7 +138,7 @@
ybf->uv_stride = uv_stride;
ybf->border = border;
- ybf->frame_size = (int)frame_size;
+ ybf->frame_size = (size_t)frame_size;
ybf->subsampling_x = ss_x;
ybf->subsampling_y = ss_y;
diff --git a/aom_scale/generic/yv12extend.c b/aom_scale/generic/yv12extend.c
index b6b3c22..5c7a052 100644
--- a/aom_scale/generic/yv12extend.c
+++ b/aom_scale/generic/yv12extend.c
@@ -16,9 +16,6 @@
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
#include "aom_scale/yv12config.h"
-#if CONFIG_AOM_HIGHBITDEPTH
-#include "av1/common/common.h"
-#endif
static void extend_plane(uint8_t *const src, int src_stride, int width,
int height, int extend_top, int extend_left,
@@ -198,34 +195,35 @@
}
void aom_extend_frame_borders_y_c(YV12_BUFFER_CONFIG *ybf) {
- const int ext_size = ybf->border;
+ int ext_size = ybf->border;
assert(ybf->y_height - ybf->y_crop_height < 16);
assert(ybf->y_width - ybf->y_crop_width < 16);
assert(ybf->y_height - ybf->y_crop_height >= 0);
assert(ybf->y_width - ybf->y_crop_width >= 0);
#if CONFIG_AOM_HIGHBITDEPTH
- if (ybf->flags & YV12_FLAG_HIGHBITDEPTH)
+ if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
ybf->y_crop_height, ext_size, ext_size,
ext_size + ybf->y_height - ybf->y_crop_height,
ext_size + ybf->y_width - ybf->y_crop_width);
- else
+ return;
+ }
#endif
- extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
- ybf->y_crop_height, ext_size, ext_size,
- ext_size + ybf->y_height - ybf->y_crop_height,
- ext_size + ybf->y_width - ybf->y_crop_width);
+ extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
+ ybf->y_crop_height, ext_size, ext_size,
+ ext_size + ybf->y_height - ybf->y_crop_height,
+ ext_size + ybf->y_width - ybf->y_crop_width);
}
+#endif // CONFIG_AV1
#if CONFIG_AOM_HIGHBITDEPTH
-void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
+static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
memcpy(dst, src, num * sizeof(uint16_t));
}
#endif // CONFIG_AOM_HIGHBITDEPTH
-#endif // CONFIG_AV1
// Copies the source image into the destination image and updates the
// destination's UMV borders.
diff --git a/aom_scale/yv12config.h b/aom_scale/yv12config.h
index 74a0d55..315718e 100644
--- a/aom_scale/yv12config.h
+++ b/aom_scale/yv12config.h
@@ -21,8 +21,12 @@
#include "aom/aom_frame_buffer.h"
#include "aom/aom_integer.h"
-#define AOMBORDERINPIXELS 32
+#define VP8BORDERINPIXELS 32
+#if CONFIG_EXT_PARTITION
+#define AOMINNERBORDERINPIXELS 160
+#else
#define AOMINNERBORDERINPIXELS 96
+#endif // CONFIG_EXT_PARTITION
#define AOM_INTERP_EXTEND 4
// TODO(jingning): Use unified inter predictor for encoder and
@@ -53,9 +57,9 @@
uint8_t *alpha_buffer;
uint8_t *buffer_alloc;
- int buffer_alloc_sz;
+ size_t buffer_alloc_sz;
int border;
- int frame_size;
+ size_t frame_size;
int subsampling_x;
int subsampling_y;
unsigned int bit_depth;
@@ -70,12 +74,6 @@
#define YV12_FLAG_HIGHBITDEPTH 8
-int aom_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
- int border);
-int aom_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width,
- int height, int border);
-int aom_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf);
-
int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
int ss_x, int ss_y,
#if CONFIG_AOM_HIGHBITDEPTH
diff --git a/aomdec.c b/aomdec.c
index 6c4f13c..e88c81f 100644
--- a/aomdec.c
+++ b/aomdec.c
@@ -93,19 +93,43 @@
static const arg_def_t outbitdeptharg =
ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames");
#endif
+#if CONFIG_EXT_TILE
+static const arg_def_t tiler = ARG_DEF(NULL, "tile-row", 1,
+ "Row index of tile to decode "
+ "(-1 for all rows)");
+static const arg_def_t tilec = ARG_DEF(NULL, "tile-column", 1,
+ "Column index of tile to decode "
+ "(-1 for all columns)");
+#endif // CONFIG_EXT_TILE
-/* clang-format off */
-static const arg_def_t *all_args[] = {
- &codecarg, &use_yv12, &use_i420, &flipuvarg, &rawvideo, &noblitarg,
- &progressarg, &limitarg, &skiparg, &postprocarg, &summaryarg, &outputfile,
- &threadsarg, &frameparallelarg, &verbosearg, &scalearg, &fb_arg,
- &md5arg, &error_concealment, &continuearg,
+static const arg_def_t *all_args[] = { &codecarg,
+ &use_yv12,
+ &use_i420,
+ &flipuvarg,
+ &rawvideo,
+ &noblitarg,
+ &progressarg,
+ &limitarg,
+ &skiparg,
+ &postprocarg,
+ &summaryarg,
+ &outputfile,
+ &threadsarg,
+ &frameparallelarg,
+ &verbosearg,
+ &scalearg,
+ &fb_arg,
+ &md5arg,
+ &error_concealment,
+ &continuearg,
#if CONFIG_AOM_HIGHBITDEPTH
- &outbitdeptharg,
+ &outbitdeptharg,
#endif
- NULL
-};
-/* clang-format on */
+#if CONFIG_EXT_TILE
+ &tiler,
+ &tilec,
+#endif // CONFIG_EXT_TILE
+ NULL };
#if CONFIG_LIBYUV
static INLINE int libyuv_scale(aom_image_t *src, aom_image_t *dst,
@@ -489,6 +513,10 @@
#if CONFIG_AOM_HIGHBITDEPTH
unsigned int output_bit_depth = 0;
#endif
+#if CONFIG_EXT_TILE
+ int tile_row = -1;
+ int tile_col = -1;
+#endif // CONFIG_EXT_TILE
int frames_corrupted = 0;
int dec_flags = 0;
int do_scale = 0;
@@ -577,6 +605,12 @@
output_bit_depth = arg_parse_uint(&arg);
}
#endif
+#if CONFIG_EXT_TILE
+ else if (arg_match(&arg, &tiler, argi))
+ tile_row = arg_parse_int(&arg);
+ else if (arg_match(&arg, &tilec, argi))
+ tile_col = arg_parse_int(&arg);
+#endif // CONFIG_EXT_TILE
else
argj++;
}
@@ -677,6 +711,22 @@
if (!quiet) fprintf(stderr, "%s\n", decoder.name);
+#if CONFIG_AV1_DECODER && CONFIG_EXT_TILE
+ if (strncmp(decoder.name, "WebM Project AV1", 17) == 0) {
+ if (aom_codec_control(&decoder, AV1_SET_DECODE_TILE_ROW, tile_row)) {
+ fprintf(stderr, "Failed to set decode_tile_row: %s\n",
+ aom_codec_error(&decoder));
+ return EXIT_FAILURE;
+ }
+
+ if (aom_codec_control(&decoder, AV1_SET_DECODE_TILE_COL, tile_col)) {
+ fprintf(stderr, "Failed to set decode_tile_col: %s\n",
+ aom_codec_error(&decoder));
+ return EXIT_FAILURE;
+ }
+ }
+#endif
+
if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip);
while (arg_skip) {
if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break;
@@ -836,6 +886,11 @@
}
#endif
+#if CONFIG_EXT_TILE
+ aom_input_ctx.width = img->d_w;
+ aom_input_ctx.height = img->d_h;
+#endif // CONFIG_EXT_TILE
+
if (single_file) {
if (use_y4m) {
char y4m_buf[Y4M_BUFFER_SIZE] = { 0 };
diff --git a/aomenc.c b/aomenc.c
index 1a948be..ef174c2 100644
--- a/aomenc.c
+++ b/aomenc.c
@@ -375,7 +375,8 @@
static const arg_def_t tile_cols =
ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2");
static const arg_def_t tile_rows =
- ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2");
+ ARG_DEF(NULL, "tile-rows", 1,
+ "Number of tile rows to use, log2 (set to 0 while threads > 1)");
static const arg_def_t lossless =
ARG_DEF(NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)");
#if CONFIG_AOM_QM
@@ -443,40 +444,75 @@
#endif
#if CONFIG_AV1_ENCODER
-/* clang-format off */
-static const arg_def_t *av1_args[] = {
- &cpu_used_av1, &auto_altref, &sharpness,
- &static_thresh, &tile_cols, &tile_rows,
- &arnr_maxframes, &arnr_strength, &arnr_type,
- &tune_ssim, &cq_level, &max_intra_rate_pct,
- &max_inter_rate_pct, &gf_cbr_boost_pct, &lossless,
-#if CONFIG_AOM_QM
- &enable_qm, &qm_min, &qm_max,
-#endif
- &frame_parallel_decoding, &aq_mode, &frame_periodic_boost,
- &noise_sens, &tune_content, &input_color_space,
- &min_gf_interval, &max_gf_interval, NULL
+#if CONFIG_EXT_PARTITION
+static const struct arg_enum_list superblock_size_enum[] = {
+ { "dynamic", AOM_SUPERBLOCK_SIZE_DYNAMIC },
+ { "64", AOM_SUPERBLOCK_SIZE_64X64 },
+ { "128", AOM_SUPERBLOCK_SIZE_128X128 },
+ { NULL, 0 }
};
-static const int av1_arg_ctrl_map[] = {
- AOME_SET_CPUUSED, AOME_SET_ENABLEAUTOALTREF,
- AOME_SET_SHARPNESS, AOME_SET_STATIC_THRESHOLD,
- AV1E_SET_TILE_COLUMNS, AV1E_SET_TILE_ROWS,
- AOME_SET_ARNR_MAXFRAMES, AOME_SET_ARNR_STRENGTH,
- AOME_SET_ARNR_TYPE, AOME_SET_TUNING,
- AOME_SET_CQ_LEVEL, AOME_SET_MAX_INTRA_BITRATE_PCT,
- AV1E_SET_MAX_INTER_BITRATE_PCT, AV1E_SET_GF_CBR_BOOST_PCT,
- AV1E_SET_LOSSLESS,
-#if CONFIG_AOM_QM
- AV1E_SET_ENABLE_QM, AV1E_SET_QM_MIN,
- AV1E_SET_QM_MAX,
-#endif
- AV1E_SET_FRAME_PARALLEL_DECODING, AV1E_SET_AQ_MODE,
- AV1E_SET_FRAME_PERIODIC_BOOST, AV1E_SET_NOISE_SENSITIVITY,
- AV1E_SET_TUNE_CONTENT, AV1E_SET_COLOR_SPACE,
- AV1E_SET_MIN_GF_INTERVAL, AV1E_SET_MAX_GF_INTERVAL,
- 0
-};
-/* clang-format on */
+static const arg_def_t superblock_size = ARG_DEF_ENUM(
+ NULL, "sb-size", 1, "Superblock size to use", superblock_size_enum);
+#endif // CONFIG_EXT_PARTITION
+
+static const arg_def_t *av1_args[] = { &cpu_used_av1,
+ &auto_altref,
+ &sharpness,
+ &static_thresh,
+ &tile_cols,
+ &tile_rows,
+ &arnr_maxframes,
+ &arnr_strength,
+ &arnr_type,
+ &tune_ssim,
+ &cq_level,
+ &max_intra_rate_pct,
+ &max_inter_rate_pct,
+ &gf_cbr_boost_pct,
+ &lossless,
+ &frame_parallel_decoding,
+ &aq_mode,
+ &frame_periodic_boost,
+ &noise_sens,
+ &tune_content,
+ &input_color_space,
+ &min_gf_interval,
+ &max_gf_interval,
+#if CONFIG_EXT_PARTITION
+ &superblock_size,
+#endif // CONFIG_EXT_PARTITION
+#if CONFIG_AOM_HIGHBITDEPTH
+ &bitdeptharg,
+ &inbitdeptharg,
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ NULL };
+static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
+ AOME_SET_ENABLEAUTOALTREF,
+ AOME_SET_SHARPNESS,
+ AOME_SET_STATIC_THRESHOLD,
+ AV1E_SET_TILE_COLUMNS,
+ AV1E_SET_TILE_ROWS,
+ AOME_SET_ARNR_MAXFRAMES,
+ AOME_SET_ARNR_STRENGTH,
+ AOME_SET_ARNR_TYPE,
+ AOME_SET_TUNING,
+ AOME_SET_CQ_LEVEL,
+ AOME_SET_MAX_INTRA_BITRATE_PCT,
+ AV1E_SET_MAX_INTER_BITRATE_PCT,
+ AV1E_SET_GF_CBR_BOOST_PCT,
+ AV1E_SET_LOSSLESS,
+ AV1E_SET_FRAME_PARALLEL_DECODING,
+ AV1E_SET_AQ_MODE,
+ AV1E_SET_FRAME_PERIODIC_BOOST,
+ AV1E_SET_NOISE_SENSITIVITY,
+ AV1E_SET_TUNE_CONTENT,
+ AV1E_SET_COLOR_SPACE,
+ AV1E_SET_MIN_GF_INTERVAL,
+ AV1E_SET_MAX_GF_INTERVAL,
+#if CONFIG_EXT_PARTITION
+ AV1E_SET_SUPERBLOCK_SIZE,
+#endif // CONFIG_EXT_PARTITION
+ 0 };
#endif
static const arg_def_t *no_args[] = { NULL };
@@ -768,7 +804,6 @@
int arg_ctrls[ARG_CTRL_CNT_MAX][2];
int arg_ctrl_cnt;
int write_webm;
- int have_kf_max_dist;
#if CONFIG_AOM_HIGHBITDEPTH
// whether to use 16bit internal buffers
int use_16bit_internal;
@@ -1163,7 +1198,6 @@
config->cfg.kf_min_dist = arg_parse_uint(&arg);
} else if (arg_match(&arg, &kf_max_dist, argi)) {
config->cfg.kf_max_dist = arg_parse_uint(&arg);
- config->have_kf_max_dist = 1;
} else if (arg_match(&arg, &kf_disabled, argi)) {
config->cfg.kf_mode = AOM_KF_DISABLED;
#if CONFIG_AOM_HIGHBITDEPTH
@@ -1287,19 +1321,6 @@
}
}
-static void set_default_kf_interval(struct stream_state *stream,
- struct AvxEncoderConfig *global) {
- /* Use a max keyframe interval of 5 seconds, if none was
- * specified on the command line.
- */
- if (!stream->config.have_kf_max_dist) {
- double global_framerate =
- (double)global->framerate.num / global->framerate.den;
- if (global_framerate > 0.0)
- stream->config.cfg.kf_max_dist = (unsigned int)(5.0 * global_framerate);
- }
-}
-
static const char *file_type_to_string(enum VideoFileType t) {
switch (t) {
case FILE_TYPE_RAW: return "RAW";
@@ -1394,9 +1415,8 @@
#if CONFIG_WEBM_IO
if (stream->config.write_webm) {
stream->webm_ctx.stream = stream->file;
- write_webm_file_header(&stream->webm_ctx, cfg, &global->framerate,
- stream->config.stereo_fmt, global->codec->fourcc,
- pixel_aspect_ratio);
+ write_webm_file_header(&stream->webm_ctx, cfg, stream->config.stereo_fmt,
+ global->codec->fourcc, pixel_aspect_ratio);
}
#else
(void)pixel_aspect_ratio;
@@ -1497,7 +1517,18 @@
#if CONFIG_DECODERS
if (global->test_decode != TEST_DECODE_OFF) {
const AvxInterface *decoder = get_aom_decoder_by_name(global->codec->name);
- aom_codec_dec_init(&stream->decoder, decoder->codec_interface(), NULL, 0);
+ aom_codec_dec_cfg_t cfg = { 0, 0, 0 };
+ aom_codec_dec_init(&stream->decoder, decoder->codec_interface(), &cfg, 0);
+
+#if CONFIG_AV1_DECODER && CONFIG_EXT_TILE
+ if (strcmp(global->codec->name, "av1") == 0) {
+ aom_codec_control(&stream->decoder, AV1_SET_DECODE_TILE_ROW, -1);
+ ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_row");
+
+ aom_codec_control(&stream->decoder, AV1_SET_DECODE_TILE_COL, -1);
+ ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_col");
+ }
+#endif
}
#endif
}
@@ -1719,33 +1750,50 @@
}
static void test_decode(struct stream_state *stream,
- enum TestDecodeFatality fatal) {
+ enum TestDecodeFatality fatal,
+ const AvxInterface *codec) {
aom_image_t enc_img, dec_img;
- struct av1_ref_frame ref_enc, ref_dec;
if (stream->mismatch_seen) return;
- ref_enc.idx = 0;
- ref_dec.idx = 0;
- aom_codec_control(&stream->encoder, AV1_GET_REFERENCE, &ref_enc);
- enc_img = ref_enc.img;
- aom_codec_control(&stream->decoder, AV1_GET_REFERENCE, &ref_dec);
- dec_img = ref_dec.img;
+ /* Get the internal reference frame */
+ if (strcmp(codec->name, "vp8") == 0) {
+ struct aom_ref_frame ref_enc, ref_dec;
+ const unsigned int frame_width = (stream->config.cfg.g_w + 15) & ~15;
+ const unsigned int frame_height = (stream->config.cfg.g_h + 15) & ~15;
+ aom_img_alloc(&ref_enc.img, AOM_IMG_FMT_I420, frame_width, frame_height, 1);
+ enc_img = ref_enc.img;
+ aom_img_alloc(&ref_dec.img, AOM_IMG_FMT_I420, frame_width, frame_height, 1);
+ dec_img = ref_dec.img;
+
+ ref_enc.frame_type = AOM_LAST_FRAME;
+ ref_dec.frame_type = AOM_LAST_FRAME;
+ aom_codec_control(&stream->encoder, AOM_COPY_REFERENCE, &ref_enc);
+ aom_codec_control(&stream->decoder, AOM_COPY_REFERENCE, &ref_dec);
+ } else {
+ aom_codec_control(&stream->encoder, AV1_GET_NEW_FRAME_IMAGE, &enc_img);
+ aom_codec_control(&stream->decoder, AV1_GET_NEW_FRAME_IMAGE, &dec_img);
+
#if CONFIG_AOM_HIGHBITDEPTH
- if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) !=
- (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
- if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
- aom_img_alloc(&enc_img, enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH,
- enc_img.d_w, enc_img.d_h, 16);
- aom_img_truncate_16_to_8(&enc_img, &ref_enc.img);
+ if ((enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) !=
+ (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH)) {
+ if (enc_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+ aom_image_t enc_hbd_img;
+ aom_img_alloc(&enc_hbd_img, enc_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH,
+ enc_img.d_w, enc_img.d_h, 16);
+ aom_img_truncate_16_to_8(&enc_hbd_img, &enc_img);
+ enc_img = enc_hbd_img;
+ }
+ if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+ aom_image_t dec_hbd_img;
+ aom_img_alloc(&dec_hbd_img, dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH,
+ dec_img.d_w, dec_img.d_h, 16);
+ aom_img_truncate_16_to_8(&dec_hbd_img, &dec_img);
+ dec_img = dec_hbd_img;
+ }
}
- if (dec_img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
- aom_img_alloc(&dec_img, dec_img.fmt - AOM_IMG_FMT_HIGHBITDEPTH,
- dec_img.d_w, dec_img.d_h, 16);
- aom_img_truncate_16_to_8(&dec_img, &ref_dec.img);
- }
- }
#endif
+ }
ctx_exit_on_error(&stream->encoder, "Failed to get encoder reference frame");
ctx_exit_on_error(&stream->decoder, "Failed to get decoder reference frame");
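Note: in the VP8 branch above, the comparison images are allocated on the
16-pixel macroblock grid before AOM_COPY_REFERENCE fills them; the
(w + 15) & ~15 expressions simply round each dimension up to a multiple of 16.
A one-line sketch of that rounding (value illustrative):

    const unsigned int aligned_h = (1080 + 15) & ~15u;  /* 1080 rounds up to 1088 */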
@@ -2019,10 +2067,10 @@
if (!global.have_framerate) {
global.framerate.num = input.framerate.numerator;
global.framerate.den = input.framerate.denominator;
+ FOREACH_STREAM(stream->config.cfg.g_timebase.den = global.framerate.num;
+ stream->config.cfg.g_timebase.num = global.framerate.den);
}
- FOREACH_STREAM(set_default_kf_interval(stream, &global));
-
/* Show configuration */
if (global.verbose && pass == 0)
FOREACH_STREAM(show_stream_config(stream, &global, &input));
@@ -2173,7 +2221,7 @@
}
if (got_data && global.test_decode != TEST_DECODE_OFF)
- FOREACH_STREAM(test_decode(stream, global.test_decode));
+ FOREACH_STREAM(test_decode(stream, global.test_decode, global.codec));
}
fflush(stdout);
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index e254ddc..3571323 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -9,7 +9,6 @@
## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
##
-
AV1_COMMON_SRCS-yes += av1_common.mk
AV1_COMMON_SRCS-yes += av1_iface_common.h
AV1_COMMON_SRCS-yes += common/alloccommon.c
@@ -29,19 +28,8 @@
AV1_COMMON_SRCS-yes += common/enums.h
AV1_COMMON_SRCS-yes += common/filter.h
AV1_COMMON_SRCS-yes += common/filter.c
-AV1_COMMON_SRCS-yes += common/convolve.c
-AV1_COMMON_SRCS-yes += common/convolve.h
-AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/av1_convolve_ssse3.c
-AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/av1_convolve_filters_ssse3.h
-ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_highbd_convolve_sse4.c
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_highbd_convolve_filters_sse4.h
-endif
-
AV1_COMMON_SRCS-yes += common/idct.h
AV1_COMMON_SRCS-yes += common/idct.c
-AV1_COMMON_SRCS-yes += common/av1_inv_txfm.h
-AV1_COMMON_SRCS-yes += common/av1_inv_txfm.c
AV1_COMMON_SRCS-yes += common/loopfilter.h
AV1_COMMON_SRCS-yes += common/thread_common.h
AV1_COMMON_SRCS-yes += common/mv.h
@@ -66,12 +54,34 @@
AV1_COMMON_SRCS-yes += common/quant_common.c
AV1_COMMON_SRCS-yes += common/reconinter.c
AV1_COMMON_SRCS-yes += common/reconintra.c
+AV1_COMMON_SRCS-yes += common/restoration.h
AV1_COMMON_SRCS-yes += common/common_data.h
AV1_COMMON_SRCS-yes += common/scan.c
AV1_COMMON_SRCS-yes += common/scan.h
# TODO(angiebird) the forward transform belongs under encoder/
-AV1_COMMON_SRCS-$(CONFIG_AV1_ENCODER) += common/av1_fwd_txfm.h
-AV1_COMMON_SRCS-$(CONFIG_AV1_ENCODER) += common/av1_fwd_txfm.c
+AV1_COMMON_SRCS-yes += common/av1_txfm.h
+AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.h
+AV1_COMMON_SRCS-yes += common/av1_fwd_txfm1d.c
+AV1_COMMON_SRCS-yes += common/av1_inv_txfm1d.h
+AV1_COMMON_SRCS-yes += common/av1_inv_txfm1d.c
+AV1_COMMON_SRCS-yes += common/av1_fwd_txfm2d.c
+AV1_COMMON_SRCS-yes += common/av1_fwd_txfm2d_cfg.h
+AV1_COMMON_SRCS-yes += common/av1_inv_txfm2d.c
+AV1_COMMON_SRCS-yes += common/av1_inv_txfm2d_cfg.h
+AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/av1_convolve_ssse3.c
+AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/av1_convolve_filters_ssse3.h
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_highbd_convolve_sse4.c
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_highbd_convolve_filters_sse4.h
+endif
+AV1_COMMON_SRCS-yes += common/convolve.c
+AV1_COMMON_SRCS-yes += common/convolve.h
+AV1_COMMON_SRCS-$(CONFIG_LOOP_RESTORATION) += common/restoration.h
+AV1_COMMON_SRCS-$(CONFIG_LOOP_RESTORATION) += common/restoration.c
+ifeq (yes,$(filter $(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION),yes))
+AV1_COMMON_SRCS-yes += common/warped_motion.h
+AV1_COMMON_SRCS-yes += common/warped_motion.c
+endif
ifeq ($(CONFIG_CLPF),yes)
AV1_COMMON_SRCS-yes += common/clpf.c
AV1_COMMON_SRCS-yes += common/clpf.h
@@ -126,10 +136,15 @@
AV1_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct16x16_msa.c
AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c
+AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/hybrid_inv_txfm_avx2.c
+
ifeq ($(CONFIG_AV1_ENCODER),yes)
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_txfm_sse2.c
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_dct32x32_impl_sse2.h
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_fwd_txfm_impl_sse2.h
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_txfm1d_sse4.h
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_fwd_txfm1d_sse4.c
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/av1_fwd_txfm2d_sse4.c
+endif
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_txfm_utility_sse4.h
endif
ifneq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
@@ -137,7 +152,8 @@
AV1_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht8x8_add_neon.c
endif
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_inv_txfm_sse2.c
-AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/av1_inv_txfm_sse2.h
+ifeq ($(CONFIG_FILTER_INTRA),yes)
+AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filterintra_sse4.c
+endif
$(eval $(call rtcd_h_template,av1_rtcd,av1/common/av1_rtcd_defs.pl))
diff --git a/av1/av1_cx.mk b/av1/av1_cx.mk
index 1dc31a4..35a0447 100644
--- a/av1/av1_cx.mk
+++ b/av1/av1_cx.mk
@@ -9,7 +9,6 @@
## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
##
-
AV1_CX_EXPORTS += exports_enc
AV1_CX_SRCS-yes += $(AV1_COMMON_SRCS-yes)
@@ -22,6 +21,8 @@
AV1_CX_SRCS-yes += encoder/bitstream.c
AV1_CX_SRCS-yes += encoder/context_tree.c
AV1_CX_SRCS-yes += encoder/context_tree.h
+AV1_CX_SRCS-yes += encoder/variance_tree.c
+AV1_CX_SRCS-yes += encoder/variance_tree.h
AV1_CX_SRCS-yes += encoder/cost.h
AV1_CX_SRCS-yes += encoder/cost.c
AV1_CX_SRCS-yes += encoder/dct.c
@@ -35,6 +36,18 @@
AV1_CX_SRCS-yes += encoder/ethread.c
AV1_CX_SRCS-yes += encoder/extend.c
AV1_CX_SRCS-yes += encoder/firstpass.c
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += ../third_party/fastfeat/fast.h
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += ../third_party/fastfeat/nonmax.c
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += ../third_party/fastfeat/fast_9.c
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += ../third_party/fastfeat/fast.c
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/corner_match.c
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/corner_match.h
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/corner_detect.c
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/corner_detect.h
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/global_motion.c
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/global_motion.h
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/ransac.c
+AV1_CX_SRCS-$(CONFIG_GLOBAL_MOTION) += encoder/ransac.h
AV1_CX_SRCS-yes += encoder/block.h
AV1_CX_SRCS-yes += encoder/bitstream.h
AV1_CX_SRCS-yes += encoder/encodemb.h
@@ -59,6 +72,8 @@
endif
AV1_CX_SRCS-yes += encoder/picklpf.c
AV1_CX_SRCS-yes += encoder/picklpf.h
+AV1_CX_SRCS-$(CONFIG_LOOP_RESTORATION) += encoder/pickrst.c
+AV1_CX_SRCS-$(CONFIG_LOOP_RESTORATION) += encoder/pickrst.h
AV1_CX_SRCS-yes += encoder/quantize.c
AV1_CX_SRCS-yes += encoder/ratectrl.c
AV1_CX_SRCS-yes += encoder/rd.c
@@ -85,7 +100,9 @@
AV1_CX_SRCS-yes += encoder/temporal_filter.h
AV1_CX_SRCS-yes += encoder/mbgraph.c
AV1_CX_SRCS-yes += encoder/mbgraph.h
-AV1_CX_SRCS-$(CONFIG_DERING) += encoder/pickdering.c
+ifeq ($(CONFIG_DERING),yes)
+AV1_CX_SRCS-yes += encoder/pickdering.c
+endif
ifeq ($(CONFIG_CLPF),yes)
AV1_CX_SRCS-yes += encoder/clpf_rdo.c
AV1_CX_SRCS-yes += encoder/clpf_rdo.h
@@ -95,7 +112,6 @@
AV1_CX_SRCS-$(HAVE_SSE4_1) += encoder/clpf_rdo_sse4_1.c
AV1_CX_SRCS-$(HAVE_NEON) += encoder/clpf_rdo_neon.c
endif
-
ifeq ($(CONFIG_PVQ),yes)
# PVQ from daala
AV1_CX_SRCS-yes += encoder/daala_compat_enc.c
@@ -121,6 +137,17 @@
AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_intrin_sse2.c
AV1_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
+AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/hybrid_fwd_txfm_avx2.c
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
+AV1_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_fwd_txfm_sse4.c
+AV1_CX_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_inv_txfm_sse4.c
+AV1_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/av1_highbd_quantize_sse4.c
+endif
+
+ifeq ($(CONFIG_EXT_INTER),yes)
+AV1_CX_SRCS-yes += encoder/wedge_utils.c
+AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/wedge_utils_sse2.c
+endif
AV1_CX_SRCS-$(HAVE_AVX2) += encoder/x86/error_intrin_avx2.c
diff --git a/av1/av1_cx_iface.c b/av1/av1_cx_iface.c
index c82cb71..e8069d6 100644
--- a/av1/av1_cx_iface.c
+++ b/av1/av1_cx_iface.c
@@ -15,6 +15,7 @@
#include "./aom_config.h"
#include "aom/aom_encoder.h"
#include "aom_ports/aom_once.h"
+#include "aom_ports/system_state.h"
#include "aom/internal/aom_codec_internal.h"
#include "./aom_version.h"
#include "av1/encoder/encoder.h"
@@ -25,6 +26,9 @@
struct av1_extracfg {
int cpu_used; // available cpu percentage in 1/16
unsigned int enable_auto_alt_ref;
+#if CONFIG_EXT_REFS
+ unsigned int enable_auto_bwd_ref;
+#endif // CONFIG_EXT_REFS
unsigned int noise_sensitivity;
unsigned int sharpness;
unsigned int static_thresh;
@@ -54,16 +58,25 @@
int color_range;
int render_width;
int render_height;
+ aom_superblock_size_t superblock_size;
};
static struct av1_extracfg default_extra_cfg = {
- 0, // cpu_used
- 1, // enable_auto_alt_ref
- 0, // noise_sensitivity
- 0, // sharpness
- 0, // static_thresh
- 6, // tile_columns
- 0, // tile_rows
+ 0, // cpu_used
+ 1, // enable_auto_alt_ref
+#if CONFIG_EXT_REFS
+ 0, // enable_auto_bwd_ref
+#endif // CONFIG_EXT_REFS
+ 0, // noise_sensitivity
+ 0, // sharpness
+ 0, // static_thresh
+#if CONFIG_EXT_TILE
+ UINT_MAX, // tile_columns
+ UINT_MAX, // tile_rows
+#else
+ 0, // tile_columns
+ 0, // tile_rows
+#endif // CONFIG_EXT_TILE
7, // arnr_max_frames
5, // arnr_strength
0, // min_gf_interval; 0 -> default decision
@@ -79,15 +92,16 @@
DEFAULT_QM_FIRST, // qm_min
DEFAULT_QM_LAST, // qm_max
#endif
- 0, // frame_parallel_decoding_mode
- NO_AQ, // aq_mode
- 0, // frame_periodic_delta_q
- AOM_BITS_8, // Bit depth
- AOM_CONTENT_DEFAULT, // content
- AOM_CS_UNKNOWN, // color space
- 0, // color range
- 0, // render width
- 0, // render height
+ 1, // frame_parallel_decoding_mode
+ NO_AQ, // aq_mode
+ 0, // frame_periodic_delta_q
+ AOM_BITS_8, // Bit depth
+ AOM_CONTENT_DEFAULT, // content
+ AOM_CS_UNKNOWN, // color space
+ 0, // color range
+ 0, // render width
+ 0, // render height
+ AOM_SUPERBLOCK_SIZE_DYNAMIC // superblock_size
};
struct aom_codec_alg_priv {
@@ -107,21 +121,10 @@
aom_postproc_cfg_t preview_ppcfg;
aom_codec_pkt_list_decl(256) pkt_list;
unsigned int fixed_kf_cntr;
- aom_codec_priv_output_cx_pkt_cb_pair_t output_cx_pkt_cb;
// BufferPool that holds all reference frames.
BufferPool *buffer_pool;
};
-static AOM_REFFRAME ref_frame_to_av1_reframe(aom_ref_frame_type_t frame) {
- switch (frame) {
- case AOM_LAST_FRAME: return AOM_LAST_FLAG;
- case AOM_GOLD_FRAME: return AOM_GOLD_FLAG;
- case AOM_ALTR_FRAME: return AOM_ALT_FLAG;
- }
- assert(0 && "Invalid Reference Frame");
- return AOM_LAST_FLAG;
-}
-
static aom_codec_err_t update_error_state(
aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) {
const aom_codec_err_t res = error->error_code;
@@ -139,10 +142,10 @@
return AOM_CODEC_INVALID_PARAM; \
} while (0)
-#define RANGE_CHECK(p, memb, lo, hi) \
- do { \
- if (!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \
- ERROR(#memb " out of range [" #lo ".." #hi "]"); \
+#define RANGE_CHECK(p, memb, lo, hi) \
+ do { \
+ if (!((p)->memb >= (lo) && (p)->memb <= (hi))) \
+ ERROR(#memb " out of range [" #lo ".." #hi "]"); \
} while (0)
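Note: the rewritten RANGE_CHECK above is behaviour-preserving; for the integer
members it is applied to, (memb == lo || memb > lo) is the same predicate as
memb >= lo, so only readability changes. A small sketch of the equivalence,
given integers x, lo and hi (fragment, illustrative):

    int old_ok = (x == lo || x > lo) && x <= hi;  /* original form */
    int new_ok = (x >= lo) && (x <= hi);          /* simplified form */
    assert(old_ok == new_ok);                     /* holds for any x, lo, hi */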
#define RANGE_CHECK_HI(p, memb, hi) \
@@ -173,7 +176,7 @@
RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
RANGE_CHECK_BOOL(extra_cfg, lossless);
RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 1);
- RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1);
+ RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1);
RANGE_CHECK_HI(cfg, g_threads, 64);
RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q);
@@ -186,8 +189,8 @@
RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100);
RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_LAST_PASS);
- RANGE_CHECK(extra_cfg, min_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
- RANGE_CHECK(extra_cfg, max_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
+ RANGE_CHECK_HI(extra_cfg, min_gf_interval, MAX_LAG_BUFFERS - 1);
+ RANGE_CHECK_HI(extra_cfg, max_gf_interval, MAX_LAG_BUFFERS - 1);
if (extra_cfg->max_gf_interval > 0) {
RANGE_CHECK(extra_cfg, max_gf_interval, 2, (MAX_LAG_BUFFERS - 1));
}
@@ -197,8 +200,8 @@
}
if (cfg->rc_resize_allowed == 1) {
- RANGE_CHECK(cfg, rc_scaled_width, 0, cfg->g_w);
- RANGE_CHECK(cfg, rc_scaled_height, 0, cfg->g_h);
+ RANGE_CHECK_HI(cfg, rc_scaled_width, cfg->g_w);
+ RANGE_CHECK_HI(cfg, rc_scaled_height, cfg->g_h);
}
// AV1 does not support a lower bound on the keyframe interval in
@@ -209,15 +212,40 @@
"kf_min_dist not supported in auto mode, use 0 "
"or kf_max_dist instead.");
- RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
+ RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 2);
+#if CONFIG_EXT_REFS
+ RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2);
+#endif // CONFIG_EXT_REFS
RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
- RANGE_CHECK(extra_cfg, tile_columns, 0, 6);
- RANGE_CHECK(extra_cfg, tile_rows, 0, 2);
+ RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64,
+ AOM_SUPERBLOCK_SIZE_DYNAMIC);
+#if CONFIG_EXT_TILE
+// TODO(any): Warning. If CONFIG_EXT_TILE is true, tile_columns really
+// means tile_width, and tile_rows really means tile_height. The interface
+// should be sanitized.
+#if CONFIG_EXT_PARTITION
+ if (extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_64X64) {
+ if (extra_cfg->tile_columns != UINT_MAX)
+ RANGE_CHECK(extra_cfg, tile_columns, 1, 32);
+ if (extra_cfg->tile_rows != UINT_MAX)
+ RANGE_CHECK(extra_cfg, tile_rows, 1, 32);
+ } else
+#endif // CONFIG_EXT_PARTITION
+ {
+ if (extra_cfg->tile_columns != UINT_MAX)
+ RANGE_CHECK(extra_cfg, tile_columns, 1, 64);
+ if (extra_cfg->tile_rows != UINT_MAX)
+ RANGE_CHECK(extra_cfg, tile_rows, 1, 64);
+ }
+#else
+ RANGE_CHECK_HI(extra_cfg, tile_columns, 6);
+ RANGE_CHECK_HI(extra_cfg, tile_rows, 2);
+#endif // CONFIG_EXT_TILE
RANGE_CHECK_HI(extra_cfg, sharpness, 7);
- RANGE_CHECK(extra_cfg, arnr_max_frames, 0, 15);
+ RANGE_CHECK_HI(extra_cfg, arnr_max_frames, 15);
RANGE_CHECK_HI(extra_cfg, arnr_strength, 6);
- RANGE_CHECK(extra_cfg, cq_level, 0, 63);
+ RANGE_CHECK_HI(extra_cfg, cq_level, 63);
RANGE_CHECK(cfg, g_bit_depth, AOM_BITS_8, AOM_BITS_12);
RANGE_CHECK(cfg, g_input_bit_depth, 8, 12);
RANGE_CHECK(extra_cfg, content, AOM_CONTENT_DEFAULT, AOM_CONTENT_INVALID - 1);
@@ -398,6 +426,9 @@
oxcf->speed = abs(extra_cfg->cpu_used);
oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref;
+#if CONFIG_EXT_REFS
+ oxcf->enable_auto_brf = extra_cfg->enable_auto_bwd_ref;
+#endif // CONFIG_EXT_REFS
oxcf->noise_sensitivity = extra_cfg->noise_sensitivity;
oxcf->sharpness = extra_cfg->sharpness;
@@ -419,8 +450,25 @@
oxcf->tuning = extra_cfg->tuning;
oxcf->content = extra_cfg->content;
+#if CONFIG_EXT_PARTITION
+ oxcf->superblock_size = extra_cfg->superblock_size;
+#endif // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_TILE
+ {
+#if CONFIG_EXT_PARTITION
+ const unsigned int max =
+ extra_cfg->superblock_size == AOM_SUPERBLOCK_SIZE_64X64 ? 64 : 32;
+#else
+ const unsigned int max = 64;
+#endif // CONFIG_EXT_PARTITION
+ oxcf->tile_columns = AOMMIN(extra_cfg->tile_columns, max);
+ oxcf->tile_rows = AOMMIN(extra_cfg->tile_rows, max);
+ }
+#else
oxcf->tile_columns = extra_cfg->tile_columns;
oxcf->tile_rows = extra_cfg->tile_rows;
+#endif // CONFIG_EXT_TILE
oxcf->error_resilient_mode = cfg->g_error_resilient;
oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;
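Note: under CONFIG_EXT_TILE the UINT_MAX value from default_extra_cfg means
"not set on the command line"; rather than rejecting it, the block above clamps
it to the largest per-tile dimension the superblock size allows. A minimal
sketch of the clamping (fragment, values illustrative):

    const unsigned int max = 32;               /* 128x128 superblock case */
    unsigned int tile_columns = UINT_MAX;      /* option left unset */
    tile_columns = AOMMIN(tile_columns, max);  /* clamps to 32 */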
@@ -540,6 +588,15 @@
return update_extra_cfg(ctx, &extra_cfg);
}
+#if CONFIG_EXT_REFS
+static aom_codec_err_t ctrl_set_enable_auto_bwd_ref(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_auto_bwd_ref = CAST(AOME_SET_ENABLEAUTOBWDREF, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+#endif // CONFIG_EXT_REFS
+
static aom_codec_err_t ctrl_set_noise_sensitivity(aom_codec_alg_priv_t *ctx,
va_list args) {
struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -787,7 +844,7 @@
uint8_t marker = 0xc0;
unsigned int mask;
int mag, index_sz;
- int i, j;
+ int i;
size_t max_frame_sz = 0;
assert(ctx->pending_frame_count);
@@ -830,6 +887,7 @@
*x++ = marker;
for (i = 0; i < ctx->pending_frame_count - 1; i++) {
unsigned int this_sz;
+ int j;
assert(ctx->pending_frame_sizes[i] > 0);
this_sz = (unsigned int)ctx->pending_frame_sizes[i] - 1;
@@ -877,18 +935,21 @@
const aom_image_t *img,
aom_codec_pts_t pts,
unsigned long duration,
- aom_enc_frame_flags_t flags,
+ aom_enc_frame_flags_t enc_flags,
unsigned long deadline) {
- aom_codec_err_t res = AOM_CODEC_OK;
+ volatile aom_codec_err_t res = AOM_CODEC_OK;
+ volatile aom_enc_frame_flags_t flags = enc_flags;
AV1_COMP *const cpi = ctx->cpi;
const aom_rational_t *const timebase = &ctx->cfg.g_timebase;
size_t data_sz;
+ if (cpi == NULL) return AOM_CODEC_INVALID_PARAM;
+
if (img != NULL) {
res = validate_img(ctx, img);
// TODO(jzern) the checks related to cpi's validity should be treated as a
// failure condition, encoder setup is done fully in init() currently.
- if (res == AOM_CODEC_OK && cpi != NULL) {
+ if (res == AOM_CODEC_OK) {
#if CONFIG_EXT_REFS
data_sz = ALIGN_POWER_OF_TWO(ctx->cfg.g_w, 5) *
ALIGN_POWER_OF_TWO(ctx->cfg.g_h, 5) * get_image_bps(img);
@@ -921,6 +982,14 @@
return AOM_CODEC_INVALID_PARAM;
}
+ if (setjmp(cpi->common.error.jmp)) {
+ cpi->common.error.setjmp = 0;
+ res = update_error_state(ctx, &cpi->common.error);
+ aom_clear_system_state();
+ return res;
+ }
+ cpi->common.error.setjmp = 1;
+
av1_apply_encoding_flags(cpi, flags);
// Handle fixed keyframe intervals
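Note: the setjmp() block added above, together with the volatile qualifiers on
res and flags earlier in this change, lets encoder_encode() recover when
internal code reports a fatal error and longjmp()s back here. A standalone
sketch of the same pattern, outside the libaom types (illustrative):

    #include <setjmp.h>

    static jmp_buf err_jmp;

    static void worker(int fail) {
      if (fail) longjmp(err_jmp, 1);       /* analogous to aom_internal_error() */
    }

    static int run(void) {
      volatile int result = -1;            /* volatile: value survives the longjmp */
      if (setjmp(err_jmp)) return result;  /* error path re-enters here */
      worker(1);
      result = 0;                          /* skipped when worker() bails out */
      return result;
    }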
@@ -932,8 +1001,7 @@
}
}
- // Initialize the encoder instance on the first frame.
- if (res == AOM_CODEC_OK && cpi != NULL) {
+ if (res == AOM_CODEC_OK) {
unsigned int lib_flags = 0;
YV12_BUFFER_CONFIG sd;
int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts);
@@ -971,7 +1039,8 @@
* the buffer size anyway.
*/
if (cx_data_sz < ctx->cx_data_sz / 2) {
- ctx->base.err_detail = "Compressed data buffer too small";
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+ "Compressed data buffer too small");
return AOM_CODEC_ERROR;
}
}
@@ -991,21 +1060,6 @@
cx_data += size;
cx_data_sz -= size;
- if (ctx->output_cx_pkt_cb.output_cx_pkt) {
- pkt.kind = AOM_CODEC_CX_FRAME_PKT;
- pkt.data.frame.pts =
- ticks_to_timebase_units(timebase, dst_time_stamp);
- pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units(
- timebase, dst_end_time_stamp - dst_time_stamp);
- pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
- pkt.data.frame.buf = ctx->pending_cx_data;
- pkt.data.frame.sz = size;
- ctx->pending_cx_data = NULL;
- ctx->pending_cx_data_sz = 0;
- ctx->pending_frame_count = 0;
- ctx->output_cx_pkt_cb.output_cx_pkt(
- &pkt, ctx->output_cx_pkt_cb.user_priv);
- }
continue;
}
@@ -1019,9 +1073,7 @@
if (ctx->pending_cx_data) {
ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
ctx->pending_cx_data_sz += size;
- // write the superframe only for the case when
- if (!ctx->output_cx_pkt_cb.output_cx_pkt)
- size += write_superframe_index(ctx);
+ size += write_superframe_index(ctx);
pkt.data.frame.buf = ctx->pending_cx_data;
pkt.data.frame.sz = ctx->pending_cx_data_sz;
ctx->pending_cx_data = NULL;
@@ -1033,11 +1085,7 @@
}
pkt.data.frame.partition_id = -1;
- if (ctx->output_cx_pkt_cb.output_cx_pkt)
- ctx->output_cx_pkt_cb.output_cx_pkt(&pkt,
- ctx->output_cx_pkt_cb.user_priv);
- else
- aom_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+ aom_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
cx_data += size;
cx_data_sz -= size;
@@ -1045,6 +1093,7 @@
}
}
+ cpi->common.error.setjmp = 0;
return res;
}
@@ -1100,6 +1149,24 @@
}
}
+static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_image_t *const new_img = va_arg(args, aom_image_t *);
+
+ if (new_img != NULL) {
+ YV12_BUFFER_CONFIG new_frame;
+
+ if (av1_get_last_show_frame(ctx->cpi, &new_frame) == 0) {
+ yuvconfig2image(new_img, &new_frame, NULL);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
static aom_codec_err_t ctrl_set_previewpp(aom_codec_alg_priv_t *ctx,
va_list args) {
(void)ctx;
@@ -1118,6 +1185,14 @@
}
}
+static aom_codec_err_t ctrl_use_reference(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ const int reference_flag = va_arg(args, int);
+
+ av1_use_as_reference(ctx->cpi, reference_flag);
+ return AOM_CODEC_OK;
+}
+
static aom_codec_err_t ctrl_set_roi_map(aom_codec_alg_priv_t *ctx,
va_list args) {
(void)ctx;
@@ -1171,16 +1246,6 @@
}
}
-static aom_codec_err_t ctrl_register_cx_callback(aom_codec_alg_priv_t *ctx,
- va_list args) {
- aom_codec_priv_output_cx_pkt_cb_pair_t *cbp =
- (aom_codec_priv_output_cx_pkt_cb_pair_t *)va_arg(args, void *);
- ctx->output_cx_pkt_cb.output_cx_pkt = cbp->output_cx_pkt;
- ctx->output_cx_pkt_cb.user_priv = cbp->user_priv;
-
- return AOM_CODEC_OK;
-}
-
static aom_codec_err_t ctrl_set_tune_content(aom_codec_alg_priv_t *ctx,
va_list args) {
struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1211,8 +1276,16 @@
return update_extra_cfg(ctx, &extra_cfg);
}
+static aom_codec_err_t ctrl_set_superblock_size(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.superblock_size = CAST(AV1E_SET_SUPERBLOCK_SIZE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ AOM_COPY_REFERENCE, ctrl_copy_reference },
+ { AOME_USE_REFERENCE, ctrl_use_reference },
// Setters
{ AOM_SET_REFERENCE, ctrl_set_reference },
@@ -1222,6 +1295,9 @@
{ AOME_SET_SCALEMODE, ctrl_set_scale_mode },
{ AOME_SET_CPUUSED, ctrl_set_cpuused },
{ AOME_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref },
+#if CONFIG_EXT_REFS
+ { AOME_SET_ENABLEAUTOBWDREF, ctrl_set_enable_auto_bwd_ref },
+#endif // CONFIG_EXT_REFS
{ AOME_SET_SHARPNESS, ctrl_set_sharpness },
{ AOME_SET_STATIC_THRESHOLD, ctrl_set_static_thresh },
{ AV1E_SET_TILE_COLUMNS, ctrl_set_tile_columns },
@@ -1243,7 +1319,6 @@
{ AV1E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode },
{ AV1E_SET_AQ_MODE, ctrl_set_aq_mode },
{ AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost },
- { AV1E_REGISTER_CX_CALLBACK, ctrl_register_cx_callback },
{ AV1E_SET_TUNE_CONTENT, ctrl_set_tune_content },
{ AV1E_SET_COLOR_SPACE, ctrl_set_color_space },
{ AV1E_SET_COLOR_RANGE, ctrl_set_color_range },
@@ -1251,12 +1326,14 @@
{ AV1E_SET_MIN_GF_INTERVAL, ctrl_set_min_gf_interval },
{ AV1E_SET_MAX_GF_INTERVAL, ctrl_set_max_gf_interval },
{ AV1E_SET_RENDER_SIZE, ctrl_set_render_size },
+ { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size },
// Getters
{ AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer },
{ AOME_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 },
{ AV1_GET_REFERENCE, ctrl_get_reference },
{ AV1E_GET_ACTIVEMAP, ctrl_get_active_map },
+ { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
{ -1, NULL },
};
@@ -1309,7 +1386,7 @@
// keyframing settings (kf)
AOM_KF_AUTO, // g_kfmode
0, // kf_min_dist
- 9999 // kf_max_dist
+ 9999, // kf_max_dist
} },
};
diff --git a/av1/av1_dx.mk b/av1/av1_dx.mk
index 24decc7..81f526c 100644
--- a/av1/av1_dx.mk
+++ b/av1/av1_dx.mk
@@ -9,7 +9,6 @@
## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
##
-
AV1_DX_EXPORTS += exports_dec
AV1_DX_SRCS-yes += $(AV1_COMMON_SRCS-yes)
diff --git a/av1/av1_dx_iface.c b/av1/av1_dx_iface.c
index fd313da..2caed90 100644
--- a/av1/av1_dx_iface.c
+++ b/av1/av1_dx_iface.c
@@ -24,6 +24,7 @@
#include "av1/common/alloccommon.h"
#include "av1/common/frame_buffers.h"
+#include "av1/common/enums.h"
#include "av1/decoder/decoder.h"
#include "av1/decoder/decodeframe.h"
@@ -56,6 +57,8 @@
int last_show_frame; // Index of last output frame.
int byte_alignment;
int skip_loop_filter;
+ int decode_tile_row;
+ int decode_tile_col;
// Frame parallel related.
int frame_parallel_decode; // frame-based threading.
@@ -119,6 +122,9 @@
(FrameWorkerData *)worker->data1;
aom_get_worker_interface()->end(worker);
av1_remove_common(&frame_worker_data->pbi->common);
+#if CONFIG_LOOP_RESTORATION
+ av1_free_restoration_buffers(&frame_worker_data->pbi->common);
+#endif // CONFIG_LOOP_RESTORATION
av1_decoder_remove(frame_worker_data->pbi);
aom_free(frame_worker_data->scratch_buffer);
#if CONFIG_MULTITHREAD
@@ -467,6 +473,11 @@
frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
frame_worker_data->pbi->decrypt_state = ctx->decrypt_state;
+#if CONFIG_EXT_TILE
+ frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
+ frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
+#endif // CONFIG_EXT_TILE
+
worker->had_error = 0;
winterface->execute(worker);
@@ -726,6 +737,38 @@
ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx;
if (ctx->need_resync) return NULL;
yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv);
+
+#if CONFIG_EXT_TILE
+ if (frame_worker_data->pbi->dec_tile_row >= 0) {
+ const int tile_row =
+ AOMMIN(frame_worker_data->pbi->dec_tile_row, cm->tile_rows - 1);
+ const int mi_row = tile_row * cm->tile_height;
+ const int ssy = ctx->img.y_chroma_shift;
+ int plane;
+ ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0];
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ ctx->img.planes[plane] +=
+ mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane];
+ }
+ ctx->img.d_h =
+ AOMMIN(cm->tile_height, cm->mi_rows - mi_row) * MI_SIZE;
+ }
+
+ if (frame_worker_data->pbi->dec_tile_col >= 0) {
+ const int tile_col =
+ AOMMIN(frame_worker_data->pbi->dec_tile_col, cm->tile_cols - 1);
+ const int mi_col = tile_col * cm->tile_width;
+ const int ssx = ctx->img.x_chroma_shift;
+ int plane;
+ ctx->img.planes[0] += mi_col * MI_SIZE;
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx);
+ }
+ ctx->img.d_w =
+ AOMMIN(cm->tile_width, cm->mi_cols - mi_col) * MI_SIZE;
+ }
+#endif // CONFIG_EXT_TILE
+
ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
img = &ctx->img;
return img;
@@ -776,7 +819,8 @@
FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
image2yuvconfig(&frame->img, &sd);
return av1_set_reference_dec(&frame_worker_data->pbi->common,
- (AOM_REFFRAME)frame->frame_type, &sd);
+ ref_frame_to_av1_reframe(frame->frame_type),
+ &sd);
} else {
return AOM_CODEC_INVALID_PARAM;
}
@@ -827,6 +871,32 @@
}
}
+static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ aom_image_t *new_img = va_arg(args, aom_image_t *);
+
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return AOM_CODEC_INCAPABLE;
+ }
+
+ if (new_img) {
+ YV12_BUFFER_CONFIG new_frame;
+ AVxWorker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+
+ if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) {
+ yuvconfig2image(new_img, &new_frame, NULL);
+ return AOM_CODEC_OK;
+ } else {
+ return AOM_CODEC_ERROR;
+ }
+ } else {
+ return AOM_CODEC_INVALID_PARAM;
+ }
+}
+
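+// Note: this decoder-side AV1_GET_NEW_FRAME_IMAGE control is what the
+// reworked test_decode() in aomenc.c calls on the AV1 path; as the code above
+// shows, it is refused in frame-parallel mode. A minimal caller-side sketch,
+// assuming an initialized serial-mode decoder (illustrative):
+//
+//   aom_image_t img;
+//   if (aom_codec_control(&decoder, AV1_GET_NEW_FRAME_IMAGE, &img) ==
+//       AOM_CODEC_OK) {
+//     /* img wraps the most recently shown frame's planes. */
+//   }
+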
static aom_codec_err_t ctrl_set_postproc(aom_codec_alg_priv_t *ctx,
va_list args) {
(void)ctx;
@@ -1031,6 +1101,17 @@
return AOM_CODEC_ERROR;
#endif
}
+static aom_codec_err_t ctrl_set_decode_tile_row(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->decode_tile_row = va_arg(args, int);
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t ctrl_set_decode_tile_col(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ ctx->decode_tile_col = va_arg(args, int);
+ return AOM_CODEC_OK;
+}
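+// Note: these two controls pair with the EXT_TILE cropping added earlier in
+// this file: negative values leave the full frame untouched, non-negative
+// values crop the output image to the requested tile. A minimal caller-side
+// sketch (tile indices illustrative):
+//
+//   aom_codec_control(&decoder, AV1_SET_DECODE_TILE_ROW, 0);
+//   aom_codec_control(&decoder, AV1_SET_DECODE_TILE_COL, 2);
+//   /* Pass -1 (as aomenc now does) to decode the whole frame again. */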
static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{ AOM_COPY_REFERENCE, ctrl_copy_reference },
@@ -1046,6 +1127,8 @@
{ AOMD_SET_DECRYPTOR, ctrl_set_decryptor },
{ AV1_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment },
{ AV1_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter },
+ { AV1_SET_DECODE_TILE_ROW, ctrl_set_decode_tile_row },
+ { AV1_SET_DECODE_TILE_COL, ctrl_set_decode_tile_col },
// Getters
{ AOMD_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates },
@@ -1055,6 +1138,7 @@
{ AV1D_GET_BIT_DEPTH, ctrl_get_bit_depth },
{ AV1D_GET_FRAME_SIZE, ctrl_get_frame_size },
{ AV1_GET_ACCOUNTING, ctrl_get_accounting },
+ { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
{ -1, NULL },
};
diff --git a/av1/av1_iface_common.h b/av1/av1_iface_common.h
index 0000f42..3e6024c 100644
--- a/av1/av1_iface_common.h
+++ b/av1/av1_iface_common.h
@@ -134,4 +134,13 @@
return AOM_CODEC_OK;
}
+static AOM_REFFRAME ref_frame_to_av1_reframe(aom_ref_frame_type_t frame) {
+ switch (frame) {
+ case AOM_LAST_FRAME: return AOM_LAST_FLAG;
+ case AOM_GOLD_FRAME: return AOM_GOLD_FLAG;
+ case AOM_ALTR_FRAME: return AOM_ALT_FLAG;
+ }
+ assert(0 && "Invalid Reference Frame");
+ return AOM_LAST_FLAG;
+}
#endif // AV1_AV1_IFACE_COMMON_H_
diff --git a/av1/common/alloccommon.c b/av1/common/alloccommon.c
index 79264df..7c5b358 100644
--- a/av1/common/alloccommon.c
+++ b/av1/common/alloccommon.c
@@ -81,13 +81,31 @@
}
}
+#if CONFIG_LOOP_RESTORATION
+void av1_free_restoration_buffers(AV1_COMMON *cm) {
+ aom_free(cm->rst_info.restoration_type);
+ cm->rst_info.restoration_type = NULL;
+ aom_free(cm->rst_info.bilateral_info);
+ cm->rst_info.bilateral_info = NULL;
+ aom_free(cm->rst_info.wiener_info);
+ cm->rst_info.wiener_info = NULL;
+}
+#endif // CONFIG_LOOP_RESTORATION
+
void av1_free_context_buffers(AV1_COMMON *cm) {
+ int i;
cm->free_mi(cm);
free_seg_map(cm);
- aom_free(cm->above_context);
- cm->above_context = NULL;
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ aom_free(cm->above_context[i]);
+ cm->above_context[i] = NULL;
+ }
aom_free(cm->above_seg_context);
cm->above_seg_context = NULL;
+#if CONFIG_VAR_TX
+ aom_free(cm->above_txfm_context);
+ cm->above_txfm_context = NULL;
+#endif
}
int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) {
@@ -107,22 +125,40 @@
}
if (cm->above_context_alloc_cols < cm->mi_cols) {
- aom_free(cm->above_context);
- cm->above_context = (ENTROPY_CONTEXT *)aom_calloc(
- 2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE,
- sizeof(*cm->above_context));
- if (!cm->above_context) goto fail;
+ // TODO(geza.lore): These are bigger than they need to be.
+ // cm->tile_width would be enough but it complicates indexing a
+ // little elsewhere.
+ const int aligned_mi_cols =
+ ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+ int i;
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ aom_free(cm->above_context[i]);
+ cm->above_context[i] = (ENTROPY_CONTEXT *)aom_calloc(
+ 2 * aligned_mi_cols, sizeof(*cm->above_context[0]));
+ if (!cm->above_context[i]) goto fail;
+ }
aom_free(cm->above_seg_context);
cm->above_seg_context = (PARTITION_CONTEXT *)aom_calloc(
- mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context));
+ aligned_mi_cols, sizeof(*cm->above_seg_context));
if (!cm->above_seg_context) goto fail;
- cm->above_context_alloc_cols = cm->mi_cols;
+
+#if CONFIG_VAR_TX
+ aom_free(cm->above_txfm_context);
+ cm->above_txfm_context = (TXFM_CONTEXT *)aom_calloc(
+ aligned_mi_cols, sizeof(*cm->above_txfm_context));
+ if (!cm->above_txfm_context) goto fail;
+#endif
+
+ cm->above_context_alloc_cols = aligned_mi_cols;
}
return 0;
fail:
+ // clear the mi_* values to force a realloc on resync
+ av1_set_mb_mi(cm, 0, 0);
av1_free_context_buffers(cm);
return 1;
}
diff --git a/av1/common/alloccommon.h b/av1/common/alloccommon.h
index bbce0ad..0a0c38c 100644
--- a/av1/common/alloccommon.h
+++ b/av1/common/alloccommon.h
@@ -28,6 +28,9 @@
void av1_free_context_buffers(struct AV1Common *cm);
void av1_free_ref_frame_buffers(struct BufferPool *pool);
+#if CONFIG_LOOP_RESTORATION
+void av1_free_restoration_buffers(struct AV1Common *cm);
+#endif // CONFIG_LOOP_RESTORATION
int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height);
void av1_free_state_buffers(struct AV1Common *cm);
diff --git a/av1/common/av1_fwd_txfm.c b/av1/common/av1_fwd_txfm.c
deleted file mode 100644
index 9248bfd..0000000
--- a/av1/common/av1_fwd_txfm.c
+++ /dev/null
@@ -1,812 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include "av1/common/av1_fwd_txfm.h"
-
-void av1_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
- // The 2D transform is done with two passes which are actually pretty
- // similar. In the first one, we transform the columns and transpose
- // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we transpose the columns (that
- // is the transposed rows) and transpose the results (so that it goes back
- // in normal/row positions).
- int pass;
- // We need an intermediate buffer between passes.
- tran_low_t intermediate[4 * 4];
- const tran_low_t *in_low = NULL;
- tran_low_t *out = intermediate;
- // Do the two transform/transpose passes
- for (pass = 0; pass < 2; ++pass) {
- tran_high_t in_high[4]; // canbe16
- tran_high_t step[4]; // canbe16
- tran_high_t temp1, temp2; // needs32
- int i;
- for (i = 0; i < 4; ++i) {
- // Load inputs.
- if (0 == pass) {
- in_high[0] = input[0 * stride] * 16;
- in_high[1] = input[1 * stride] * 16;
- in_high[2] = input[2 * stride] * 16;
- in_high[3] = input[3 * stride] * 16;
- if (i == 0 && in_high[0]) {
- in_high[0] += 1;
- }
- } else {
- assert(in_low != NULL);
- in_high[0] = in_low[0 * 4];
- in_high[1] = in_low[1 * 4];
- in_high[2] = in_low[2 * 4];
- in_high[3] = in_low[3 * 4];
- in_low++;
- }
- // Transform.
- step[0] = in_high[0] + in_high[3];
- step[1] = in_high[1] + in_high[2];
- step[2] = in_high[1] - in_high[2];
- step[3] = in_high[0] - in_high[3];
- temp1 = (step[0] + step[1]) * cospi_16_64;
- temp2 = (step[0] - step[1]) * cospi_16_64;
- out[0] = (tran_low_t)fdct_round_shift(temp1);
- out[2] = (tran_low_t)fdct_round_shift(temp2);
- temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
- temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
- out[1] = (tran_low_t)fdct_round_shift(temp1);
- out[3] = (tran_low_t)fdct_round_shift(temp2);
- // Do next column (which is a transposed row in second/horizontal pass)
- input++;
- out += 4;
- }
- // Setup in_low/out for next pass.
- in_low = intermediate;
- out = output;
- }
-
- {
- int i, j;
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
- }
- }
-}
-
-void av1_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
- int r, c;
- tran_low_t sum = 0;
- for (r = 0; r < 4; ++r)
- for (c = 0; c < 4; ++c) sum += input[r * stride + c];
-
- output[0] = sum << 1;
- output[1] = 0;
-}
-
-void av1_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
- int i, j;
- tran_low_t intermediate[64];
- int pass;
- tran_low_t *output = intermediate;
- const tran_low_t *in = NULL;
-
- // Transform columns
- for (pass = 0; pass < 2; ++pass) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
- tran_high_t t0, t1, t2, t3; // needs32
- tran_high_t x0, x1, x2, x3; // canbe16
-
- for (i = 0; i < 8; i++) {
- // stage 1
- if (pass == 0) {
- s0 = (input[0 * stride] + input[7 * stride]) * 4;
- s1 = (input[1 * stride] + input[6 * stride]) * 4;
- s2 = (input[2 * stride] + input[5 * stride]) * 4;
- s3 = (input[3 * stride] + input[4 * stride]) * 4;
- s4 = (input[3 * stride] - input[4 * stride]) * 4;
- s5 = (input[2 * stride] - input[5 * stride]) * 4;
- s6 = (input[1 * stride] - input[6 * stride]) * 4;
- s7 = (input[0 * stride] - input[7 * stride]) * 4;
- ++input;
- } else {
- s0 = in[0 * 8] + in[7 * 8];
- s1 = in[1 * 8] + in[6 * 8];
- s2 = in[2 * 8] + in[5 * 8];
- s3 = in[3 * 8] + in[4 * 8];
- s4 = in[3 * 8] - in[4 * 8];
- s5 = in[2 * 8] - in[5 * 8];
- s6 = in[1 * 8] - in[6 * 8];
- s7 = in[0 * 8] - in[7 * 8];
- ++in;
- }
-
- // fdct4(step, step);
- x0 = s0 + s3;
- x1 = s1 + s2;
- x2 = s1 - s2;
- x3 = s0 - s3;
- t0 = (x0 + x1) * cospi_16_64;
- t1 = (x0 - x1) * cospi_16_64;
- t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
- t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
- output[0] = (tran_low_t)fdct_round_shift(t0);
- output[2] = (tran_low_t)fdct_round_shift(t2);
- output[4] = (tran_low_t)fdct_round_shift(t1);
- output[6] = (tran_low_t)fdct_round_shift(t3);
-
- // Stage 2
- t0 = (s6 - s5) * cospi_16_64;
- t1 = (s6 + s5) * cospi_16_64;
- t2 = fdct_round_shift(t0);
- t3 = fdct_round_shift(t1);
-
- // Stage 3
- x0 = s4 + t2;
- x1 = s4 - t2;
- x2 = s7 - t3;
- x3 = s7 + t3;
-
- // Stage 4
- t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
- t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
- t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
- t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- output[1] = (tran_low_t)fdct_round_shift(t0);
- output[3] = (tran_low_t)fdct_round_shift(t2);
- output[5] = (tran_low_t)fdct_round_shift(t1);
- output[7] = (tran_low_t)fdct_round_shift(t3);
- output += 8;
- }
- in = intermediate;
- output = final_output;
- }
-
- // Rows
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
- }
-}
-
-void av1_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
- int r, c;
- tran_low_t sum = 0;
- for (r = 0; r < 8; ++r)
- for (c = 0; c < 8; ++c) sum += input[r * stride + c];
-
- output[0] = sum;
- output[1] = 0;
-}
-
-void av1_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
- // The 2D transform is done with two passes which are actually pretty
- // similar. In the first one, we transform the columns and transpose
- // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we transpose the columns (that
- // is the transposed rows) and transpose the results (so that it goes back
- // in normal/row positions).
- int pass;
- // We need an intermediate buffer between passes.
- tran_low_t intermediate[256];
- const tran_low_t *in_low = NULL;
- tran_low_t *out = intermediate;
- // Do the two transform/transpose passes
- for (pass = 0; pass < 2; ++pass) {
- tran_high_t step1[8]; // canbe16
- tran_high_t step2[8]; // canbe16
- tran_high_t step3[8]; // canbe16
- tran_high_t in_high[8]; // canbe16
- tran_high_t temp1, temp2; // needs32
- int i;
- for (i = 0; i < 16; i++) {
- if (0 == pass) {
- // Calculate input for the first 8 results.
- in_high[0] = (input[0 * stride] + input[15 * stride]) * 4;
- in_high[1] = (input[1 * stride] + input[14 * stride]) * 4;
- in_high[2] = (input[2 * stride] + input[13 * stride]) * 4;
- in_high[3] = (input[3 * stride] + input[12 * stride]) * 4;
- in_high[4] = (input[4 * stride] + input[11 * stride]) * 4;
- in_high[5] = (input[5 * stride] + input[10 * stride]) * 4;
- in_high[6] = (input[6 * stride] + input[9 * stride]) * 4;
- in_high[7] = (input[7 * stride] + input[8 * stride]) * 4;
- // Calculate input for the next 8 results.
- step1[0] = (input[7 * stride] - input[8 * stride]) * 4;
- step1[1] = (input[6 * stride] - input[9 * stride]) * 4;
- step1[2] = (input[5 * stride] - input[10 * stride]) * 4;
- step1[3] = (input[4 * stride] - input[11 * stride]) * 4;
- step1[4] = (input[3 * stride] - input[12 * stride]) * 4;
- step1[5] = (input[2 * stride] - input[13 * stride]) * 4;
- step1[6] = (input[1 * stride] - input[14 * stride]) * 4;
- step1[7] = (input[0 * stride] - input[15 * stride]) * 4;
- } else {
- // Calculate input for the first 8 results.
- assert(in_low != NULL);
- in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2);
- in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2);
- in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2);
- in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2);
- in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2);
- in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2);
- in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2);
- in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2);
- // Calculate input for the next 8 results.
- step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2);
- step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2);
- step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2);
- step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2);
- step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2);
- step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2);
- step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2);
- step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2);
- in_low++;
- }
- // Work on the first eight values; fdct8(input, even_results);
- {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
- tran_high_t t0, t1, t2, t3; // needs32
- tran_high_t x0, x1, x2, x3; // canbe16
-
- // stage 1
- s0 = in_high[0] + in_high[7];
- s1 = in_high[1] + in_high[6];
- s2 = in_high[2] + in_high[5];
- s3 = in_high[3] + in_high[4];
- s4 = in_high[3] - in_high[4];
- s5 = in_high[2] - in_high[5];
- s6 = in_high[1] - in_high[6];
- s7 = in_high[0] - in_high[7];
-
- // fdct4(step, step);
- x0 = s0 + s3;
- x1 = s1 + s2;
- x2 = s1 - s2;
- x3 = s0 - s3;
- t0 = (x0 + x1) * cospi_16_64;
- t1 = (x0 - x1) * cospi_16_64;
- t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
- t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
- out[0] = (tran_low_t)fdct_round_shift(t0);
- out[4] = (tran_low_t)fdct_round_shift(t2);
- out[8] = (tran_low_t)fdct_round_shift(t1);
- out[12] = (tran_low_t)fdct_round_shift(t3);
-
- // Stage 2
- t0 = (s6 - s5) * cospi_16_64;
- t1 = (s6 + s5) * cospi_16_64;
- t2 = fdct_round_shift(t0);
- t3 = fdct_round_shift(t1);
-
- // Stage 3
- x0 = s4 + t2;
- x1 = s4 - t2;
- x2 = s7 - t3;
- x3 = s7 + t3;
-
- // Stage 4
- t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
- t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
- t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
- t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- out[2] = (tran_low_t)fdct_round_shift(t0);
- out[6] = (tran_low_t)fdct_round_shift(t2);
- out[10] = (tran_low_t)fdct_round_shift(t1);
- out[14] = (tran_low_t)fdct_round_shift(t3);
- }
- // Work on the next eight values; step1 -> odd_results
- {
- // step 2
- temp1 = (step1[5] - step1[2]) * cospi_16_64;
- temp2 = (step1[4] - step1[3]) * cospi_16_64;
- step2[2] = fdct_round_shift(temp1);
- step2[3] = fdct_round_shift(temp2);
- temp1 = (step1[4] + step1[3]) * cospi_16_64;
- temp2 = (step1[5] + step1[2]) * cospi_16_64;
- step2[4] = fdct_round_shift(temp1);
- step2[5] = fdct_round_shift(temp2);
- // step 3
- step3[0] = step1[0] + step2[3];
- step3[1] = step1[1] + step2[2];
- step3[2] = step1[1] - step2[2];
- step3[3] = step1[0] - step2[3];
- step3[4] = step1[7] - step2[4];
- step3[5] = step1[6] - step2[5];
- step3[6] = step1[6] + step2[5];
- step3[7] = step1[7] + step2[4];
- // step 4
- temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
- temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
- step2[1] = fdct_round_shift(temp1);
- step2[2] = fdct_round_shift(temp2);
- temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
- temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
- step2[5] = fdct_round_shift(temp1);
- step2[6] = fdct_round_shift(temp2);
- // step 5
- step1[0] = step3[0] + step2[1];
- step1[1] = step3[0] - step2[1];
- step1[2] = step3[3] + step2[2];
- step1[3] = step3[3] - step2[2];
- step1[4] = step3[4] - step2[5];
- step1[5] = step3[4] + step2[5];
- step1[6] = step3[7] - step2[6];
- step1[7] = step3[7] + step2[6];
- // step 6
- temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
- temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
- out[1] = (tran_low_t)fdct_round_shift(temp1);
- out[9] = (tran_low_t)fdct_round_shift(temp2);
- temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
- temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
- out[5] = (tran_low_t)fdct_round_shift(temp1);
- out[13] = (tran_low_t)fdct_round_shift(temp2);
- temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
- temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
- out[3] = (tran_low_t)fdct_round_shift(temp1);
- out[11] = (tran_low_t)fdct_round_shift(temp2);
- temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
- temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
- out[7] = (tran_low_t)fdct_round_shift(temp1);
- out[15] = (tran_low_t)fdct_round_shift(temp2);
- }
- // Do next column (which is a transposed row in second/horizontal pass)
- input++;
- out += 16;
- }
- // Setup in/out for next pass.
- in_low = intermediate;
- out = output;
- }
-}
-
-void av1_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
- int r, c;
- tran_low_t sum = 0;
- for (r = 0; r < 16; ++r)
- for (c = 0; c < 16; ++c) sum += input[r * stride + c];
-
- output[0] = sum >> 1;
- output[1] = 0;
-}
-
-static INLINE tran_high_t dct_32_round(tran_high_t input) {
- tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- // TODO(debargha, peter.derivaz): Find new bounds for this assert,
- // and make the bounds consts.
- // assert(-131072 <= rv && rv <= 131071);
- return rv;
-}
-
-static INLINE tran_high_t half_round_shift(tran_high_t input) {
- tran_high_t rv = (input + 1 + (input < 0)) >> 2;
- return rv;
-}
-
-void av1_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
- tran_high_t step[32];
- // Stage 1
- step[0] = input[0] + input[(32 - 1)];
- step[1] = input[1] + input[(32 - 2)];
- step[2] = input[2] + input[(32 - 3)];
- step[3] = input[3] + input[(32 - 4)];
- step[4] = input[4] + input[(32 - 5)];
- step[5] = input[5] + input[(32 - 6)];
- step[6] = input[6] + input[(32 - 7)];
- step[7] = input[7] + input[(32 - 8)];
- step[8] = input[8] + input[(32 - 9)];
- step[9] = input[9] + input[(32 - 10)];
- step[10] = input[10] + input[(32 - 11)];
- step[11] = input[11] + input[(32 - 12)];
- step[12] = input[12] + input[(32 - 13)];
- step[13] = input[13] + input[(32 - 14)];
- step[14] = input[14] + input[(32 - 15)];
- step[15] = input[15] + input[(32 - 16)];
- step[16] = -input[16] + input[(32 - 17)];
- step[17] = -input[17] + input[(32 - 18)];
- step[18] = -input[18] + input[(32 - 19)];
- step[19] = -input[19] + input[(32 - 20)];
- step[20] = -input[20] + input[(32 - 21)];
- step[21] = -input[21] + input[(32 - 22)];
- step[22] = -input[22] + input[(32 - 23)];
- step[23] = -input[23] + input[(32 - 24)];
- step[24] = -input[24] + input[(32 - 25)];
- step[25] = -input[25] + input[(32 - 26)];
- step[26] = -input[26] + input[(32 - 27)];
- step[27] = -input[27] + input[(32 - 28)];
- step[28] = -input[28] + input[(32 - 29)];
- step[29] = -input[29] + input[(32 - 30)];
- step[30] = -input[30] + input[(32 - 31)];
- step[31] = -input[31] + input[(32 - 32)];
-
- // Stage 2
- output[0] = step[0] + step[16 - 1];
- output[1] = step[1] + step[16 - 2];
- output[2] = step[2] + step[16 - 3];
- output[3] = step[3] + step[16 - 4];
- output[4] = step[4] + step[16 - 5];
- output[5] = step[5] + step[16 - 6];
- output[6] = step[6] + step[16 - 7];
- output[7] = step[7] + step[16 - 8];
- output[8] = -step[8] + step[16 - 9];
- output[9] = -step[9] + step[16 - 10];
- output[10] = -step[10] + step[16 - 11];
- output[11] = -step[11] + step[16 - 12];
- output[12] = -step[12] + step[16 - 13];
- output[13] = -step[13] + step[16 - 14];
- output[14] = -step[14] + step[16 - 15];
- output[15] = -step[15] + step[16 - 16];
-
- output[16] = step[16];
- output[17] = step[17];
- output[18] = step[18];
- output[19] = step[19];
-
- output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
- output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
- output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
- output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);
-
- output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
- output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
- output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
- output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);
-
- output[28] = step[28];
- output[29] = step[29];
- output[30] = step[30];
- output[31] = step[31];
-
- // dump the magnitude by 4, hence the intermediate values are within
- // the range of 16 bits.
- if (round) {
- output[0] = half_round_shift(output[0]);
- output[1] = half_round_shift(output[1]);
- output[2] = half_round_shift(output[2]);
- output[3] = half_round_shift(output[3]);
- output[4] = half_round_shift(output[4]);
- output[5] = half_round_shift(output[5]);
- output[6] = half_round_shift(output[6]);
- output[7] = half_round_shift(output[7]);
- output[8] = half_round_shift(output[8]);
- output[9] = half_round_shift(output[9]);
- output[10] = half_round_shift(output[10]);
- output[11] = half_round_shift(output[11]);
- output[12] = half_round_shift(output[12]);
- output[13] = half_round_shift(output[13]);
- output[14] = half_round_shift(output[14]);
- output[15] = half_round_shift(output[15]);
-
- output[16] = half_round_shift(output[16]);
- output[17] = half_round_shift(output[17]);
- output[18] = half_round_shift(output[18]);
- output[19] = half_round_shift(output[19]);
- output[20] = half_round_shift(output[20]);
- output[21] = half_round_shift(output[21]);
- output[22] = half_round_shift(output[22]);
- output[23] = half_round_shift(output[23]);
- output[24] = half_round_shift(output[24]);
- output[25] = half_round_shift(output[25]);
- output[26] = half_round_shift(output[26]);
- output[27] = half_round_shift(output[27]);
- output[28] = half_round_shift(output[28]);
- output[29] = half_round_shift(output[29]);
- output[30] = half_round_shift(output[30]);
- output[31] = half_round_shift(output[31]);
- }
-
- // Stage 3
- step[0] = output[0] + output[(8 - 1)];
- step[1] = output[1] + output[(8 - 2)];
- step[2] = output[2] + output[(8 - 3)];
- step[3] = output[3] + output[(8 - 4)];
- step[4] = -output[4] + output[(8 - 5)];
- step[5] = -output[5] + output[(8 - 6)];
- step[6] = -output[6] + output[(8 - 7)];
- step[7] = -output[7] + output[(8 - 8)];
- step[8] = output[8];
- step[9] = output[9];
- step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
- step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
- step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
- step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
- step[14] = output[14];
- step[15] = output[15];
-
- step[16] = output[16] + output[23];
- step[17] = output[17] + output[22];
- step[18] = output[18] + output[21];
- step[19] = output[19] + output[20];
- step[20] = -output[20] + output[19];
- step[21] = -output[21] + output[18];
- step[22] = -output[22] + output[17];
- step[23] = -output[23] + output[16];
- step[24] = -output[24] + output[31];
- step[25] = -output[25] + output[30];
- step[26] = -output[26] + output[29];
- step[27] = -output[27] + output[28];
- step[28] = output[28] + output[27];
- step[29] = output[29] + output[26];
- step[30] = output[30] + output[25];
- step[31] = output[31] + output[24];
-
- // Stage 4
- output[0] = step[0] + step[3];
- output[1] = step[1] + step[2];
- output[2] = -step[2] + step[1];
- output[3] = -step[3] + step[0];
- output[4] = step[4];
- output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
- output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
- output[7] = step[7];
- output[8] = step[8] + step[11];
- output[9] = step[9] + step[10];
- output[10] = -step[10] + step[9];
- output[11] = -step[11] + step[8];
- output[12] = -step[12] + step[15];
- output[13] = -step[13] + step[14];
- output[14] = step[14] + step[13];
- output[15] = step[15] + step[12];
-
- output[16] = step[16];
- output[17] = step[17];
- output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
- output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
- output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
- output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
- output[22] = step[22];
- output[23] = step[23];
- output[24] = step[24];
- output[25] = step[25];
- output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
- output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
- output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
- output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
- output[30] = step[30];
- output[31] = step[31];
-
- // Stage 5
- step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
- step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
- step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
- step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
- step[4] = output[4] + output[5];
- step[5] = -output[5] + output[4];
- step[6] = -output[6] + output[7];
- step[7] = output[7] + output[6];
- step[8] = output[8];
- step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
- step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
- step[11] = output[11];
- step[12] = output[12];
- step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
- step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
- step[15] = output[15];
-
- step[16] = output[16] + output[19];
- step[17] = output[17] + output[18];
- step[18] = -output[18] + output[17];
- step[19] = -output[19] + output[16];
- step[20] = -output[20] + output[23];
- step[21] = -output[21] + output[22];
- step[22] = output[22] + output[21];
- step[23] = output[23] + output[20];
- step[24] = output[24] + output[27];
- step[25] = output[25] + output[26];
- step[26] = -output[26] + output[25];
- step[27] = -output[27] + output[24];
- step[28] = -output[28] + output[31];
- step[29] = -output[29] + output[30];
- step[30] = output[30] + output[29];
- step[31] = output[31] + output[28];
-
- // Stage 6
- output[0] = step[0];
- output[1] = step[1];
- output[2] = step[2];
- output[3] = step[3];
- output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
- output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
- output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
- output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
- output[8] = step[8] + step[9];
- output[9] = -step[9] + step[8];
- output[10] = -step[10] + step[11];
- output[11] = step[11] + step[10];
- output[12] = step[12] + step[13];
- output[13] = -step[13] + step[12];
- output[14] = -step[14] + step[15];
- output[15] = step[15] + step[14];
-
- output[16] = step[16];
- output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
- output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
- output[19] = step[19];
- output[20] = step[20];
- output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
- output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
- output[23] = step[23];
- output[24] = step[24];
- output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
- output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
- output[27] = step[27];
- output[28] = step[28];
- output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
- output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
- output[31] = step[31];
-
- // Stage 7
- step[0] = output[0];
- step[1] = output[1];
- step[2] = output[2];
- step[3] = output[3];
- step[4] = output[4];
- step[5] = output[5];
- step[6] = output[6];
- step[7] = output[7];
- step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
- step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
- step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
- step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
- step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
- step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
- step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
- step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);
-
- step[16] = output[16] + output[17];
- step[17] = -output[17] + output[16];
- step[18] = -output[18] + output[19];
- step[19] = output[19] + output[18];
- step[20] = output[20] + output[21];
- step[21] = -output[21] + output[20];
- step[22] = -output[22] + output[23];
- step[23] = output[23] + output[22];
- step[24] = output[24] + output[25];
- step[25] = -output[25] + output[24];
- step[26] = -output[26] + output[27];
- step[27] = output[27] + output[26];
- step[28] = output[28] + output[29];
- step[29] = -output[29] + output[28];
- step[30] = -output[30] + output[31];
- step[31] = output[31] + output[30];
-
- // Final stage --- outputs indices are bit-reversed.
- output[0] = step[0];
- output[16] = step[1];
- output[8] = step[2];
- output[24] = step[3];
- output[4] = step[4];
- output[20] = step[5];
- output[12] = step[6];
- output[28] = step[7];
- output[2] = step[8];
- output[18] = step[9];
- output[10] = step[10];
- output[26] = step[11];
- output[6] = step[12];
- output[22] = step[13];
- output[14] = step[14];
- output[30] = step[15];
-
- output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
- output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
- output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
- output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
- output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
- output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
- output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
- output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
- output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
- output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
- output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
- output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
- output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
- output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
- output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
- output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
-}
-
-void av1_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
- int i, j;
- tran_high_t output[32 * 32];
-
- // Columns
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
- av1_fdct32(temp_in, temp_out, 0);
- for (j = 0; j < 32; ++j)
- output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
- }
-
- // Rows
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
- av1_fdct32(temp_in, temp_out, 0);
- for (j = 0; j < 32; ++j)
- out[j + i * 32] =
- (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
- }
-}
-
-// Note that although we use dct_32_round in dct32 computation flow,
-// this 2d fdct32x32 for rate-distortion optimization loop is operating
-// within 16 bits precision.
-void av1_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
- int i, j;
- tran_high_t output[32 * 32];
-
- // Columns
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
- av1_fdct32(temp_in, temp_out, 0);
- for (j = 0; j < 32; ++j)
- // TODO(cd): see quality impact of only doing
- // output[j * 32 + i] = (temp_out[j] + 1) >> 2;
- // PS: also change code in av1_dsp/x86/av1_dct_sse2.c
- output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
- }
-
- // Rows
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
- av1_fdct32(temp_in, temp_out, 1);
- for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
- }
-}
-
-void av1_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
- int r, c;
- tran_low_t sum = 0;
- for (r = 0; r < 32; ++r)
- for (c = 0; c < 32; ++c) sum += input[r * stride + c];
-
- output[0] = sum >> 3;
- output[1] = 0;
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-void av1_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
- int stride) {
- av1_fdct4x4_c(input, output, stride);
-}
-
-void av1_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
- int stride) {
- av1_fdct8x8_c(input, final_output, stride);
-}
-
-void av1_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
- int stride) {
- av1_fdct8x8_1_c(input, final_output, stride);
-}
-
-void av1_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
- int stride) {
- av1_fdct16x16_c(input, output, stride);
-}
-
-void av1_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
- int stride) {
- av1_fdct16x16_1_c(input, output, stride);
-}
-
-void av1_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
- av1_fdct32x32_c(input, out, stride);
-}
-
-void av1_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
- int stride) {
- av1_fdct32x32_rd_c(input, out, stride);
-}
-
-void av1_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out,
- int stride) {
- av1_fdct32x32_1_c(input, out, stride);
-}
-#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/av1_fwd_txfm.h b/av1/common/av1_fwd_txfm.h
deleted file mode 100644
index db763e5..0000000
--- a/av1/common/av1_fwd_txfm.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AV1_COMMON_AV1_FWD_TXFM_H_
-#define AV1_COMMON_AV1_FWD_TXFM_H_
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/fwd_txfm.h"
-
-void av1_fdct32(const tran_high_t *input, tran_high_t *output, int round);
-#endif // AV1_COMMON_AV1_FWD_TXFM_H_
diff --git a/av1/common/av1_fwd_txfm1d.c b/av1/common/av1_fwd_txfm1d.c
new file mode 100644
index 0000000..3e9d5ec
--- /dev/null
+++ b/av1/common/av1_fwd_txfm1d.c
@@ -0,0 +1,2326 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "av1/common/av1_fwd_txfm1d.h"
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#include <assert.h>  // range_check() below uses assert() and printf().
+#include <stdio.h>
+#define range_check(stage, input, buf, size, bit) \
+ { \
+ int i, j; \
+ for (i = 0; i < size; ++i) { \
+ int buf_bit = get_max_bit(abs(buf[i])) + 1; \
+ if (buf_bit > bit) { \
+ printf("======== %s %d overflow ========\n", __FILE__, __LINE__); \
+ printf("stage: %d node: %d\n", stage, i); \
+ printf("bit: %d buf_bit: %d buf[i]: %d\n", bit, buf_bit, buf[i]); \
+ printf("input:\n"); \
+ for (j = 0; j < size; j++) { \
+ printf("%d,", input[j]); \
+ } \
+ printf("\n"); \
+ assert(0); \
+ } \
+ } \
+ }
+#else
+#define range_check(stage, input, buf, size, bit) \
+ { \
+ (void)stage; \
+ (void)input; \
+ (void)buf; \
+ (void)size; \
+ (void)bit; \
+ }
+#endif
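+
+// With CONFIG_COEFFICIENT_RANGE_CHECKING enabled, range_check() verifies that
+// every intermediate coefficient of a stage fits in stage_range[stage] signed
+// bits; on overflow it prints the stage, the node index and the whole input
+// vector before asserting. Without the config it expands to a no-op, so the
+// checks cost nothing in normal builds. get_max_bit() is assumed to come from
+// av1_txfm.h and to return the index of the highest set bit of its argument.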
+
+// TODO(angiebird): Make 1-d txfm functions static
+void av1_fdct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 4;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[4];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[3];
+ bf1[1] = input[1] + input[2];
+ bf1[2] = -input[2] + input[1];
+ bf1[3] = -input[3] + input[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[2];
+ bf1[2] = bf0[1];
+ bf1[3] = bf0[3];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
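+
+// Reading the butterflies above: half_btf(w0, in0, w1, in1, bit) is assumed
+// (per av1_txfm.h) to compute round_shift(w0 * in0 + w1 * in1, bit), i.e.
+// (w0 * in0 + w1 * in1 + (1 << (bit - 1))) >> bit, and
+// cospi_arr[bit - cos_bit_min][i] to hold round(cos(i * PI / 128) * (1 << bit)).
+// Under that assumption, the stage-2 rotation of av1_fdct4_new maps the
+// stage-1 sums/differences s[0..3] to (bit = cos_bit[2]):
+//
+//   out[0] = round_shift(cospi[32] * s[0] + cospi[32] * s[1], bit);
+//   out[1] = round_shift(cospi[32] * s[0] - cospi[32] * s[1], bit);
+//   out[2] = round_shift(cospi[48] * s[2] + cospi[16] * s[3], bit);
+//   out[3] = round_shift(cospi[48] * s[3] - cospi[16] * s[2], bit);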
+
+void av1_fdct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 8;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[7];
+ bf1[1] = input[1] + input[6];
+ bf1[2] = input[2] + input[5];
+ bf1[3] = input[3] + input[4];
+ bf1[4] = -input[4] + input[3];
+ bf1[5] = -input[5] + input[2];
+ bf1[6] = -input[6] + input[1];
+ bf1[7] = -input[7] + input[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+ bf1[7] = bf0[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[4];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[6];
+ bf1[4] = bf0[1];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[3];
+ bf1[7] = bf0[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
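+
+// A minimal sketch of how a 2-D forward transform could drive this 1-D kernel
+// (columns first, then rows). The names below are only illustrative; the real
+// 2-D wrappers and their per-stage cos_bit/stage_range tables live elsewhere
+// and also apply rescaling shifts between the passes:
+//
+//   void fdct8x8_sketch(const int16_t *input, int32_t *coeff, int stride,
+//                       const int8_t *cos_bit, const int8_t *stage_range) {
+//     int32_t buf[8 * 8], tmp_in[8], tmp_out[8];
+//     for (int c = 0; c < 8; ++c) {                      // column transforms
+//       for (int r = 0; r < 8; ++r) tmp_in[r] = input[r * stride + c];
+//       av1_fdct8_new(tmp_in, tmp_out, cos_bit, stage_range);
+//       for (int r = 0; r < 8; ++r) buf[r * 8 + c] = tmp_out[r];
+//     }
+//     for (int r = 0; r < 8; ++r)                        // row transforms
+//       av1_fdct8_new(buf + r * 8, coeff + r * 8, cos_bit, stage_range);
+//   }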
+
+void av1_fdct16_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 16;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[15];
+ bf1[1] = input[1] + input[14];
+ bf1[2] = input[2] + input[13];
+ bf1[3] = input[3] + input[12];
+ bf1[4] = input[4] + input[11];
+ bf1[5] = input[5] + input[10];
+ bf1[6] = input[6] + input[9];
+ bf1[7] = input[7] + input[8];
+ bf1[8] = -input[8] + input[7];
+ bf1[9] = -input[9] + input[6];
+ bf1[10] = -input[10] + input[5];
+ bf1[11] = -input[11] + input[4];
+ bf1[12] = -input[12] + input[3];
+ bf1[13] = -input[13] + input[2];
+ bf1[14] = -input[14] + input[1];
+ bf1[15] = -input[15] + input[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
+ bf1[15] = bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[8];
+ bf1[2] = bf0[4];
+ bf1[3] = bf0[12];
+ bf1[4] = bf0[2];
+ bf1[5] = bf0[10];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[14];
+ bf1[8] = bf0[1];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[5];
+ bf1[11] = bf0[13];
+ bf1[12] = bf0[3];
+ bf1[13] = bf0[11];
+ bf1[14] = bf0[7];
+ bf1[15] = bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
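+
+// Note on the structure above: each N-point DCT flow uses 2 * log2(N) - 1
+// compute stages (plus the stage-0 range check), and the last stage is a pure
+// permutation that reorders the butterfly nodes by bit-reversed index. For
+// N = 16 the node order copied into outputs 0..15 is
+// 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15, so output[k] ends up
+// holding the k-th DCT coefficient.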
+
+void av1_fdct32_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 32;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[32];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[31];
+ bf1[1] = input[1] + input[30];
+ bf1[2] = input[2] + input[29];
+ bf1[3] = input[3] + input[28];
+ bf1[4] = input[4] + input[27];
+ bf1[5] = input[5] + input[26];
+ bf1[6] = input[6] + input[25];
+ bf1[7] = input[7] + input[24];
+ bf1[8] = input[8] + input[23];
+ bf1[9] = input[9] + input[22];
+ bf1[10] = input[10] + input[21];
+ bf1[11] = input[11] + input[20];
+ bf1[12] = input[12] + input[19];
+ bf1[13] = input[13] + input[18];
+ bf1[14] = input[14] + input[17];
+ bf1[15] = input[15] + input[16];
+ bf1[16] = -input[16] + input[15];
+ bf1[17] = -input[17] + input[14];
+ bf1[18] = -input[18] + input[13];
+ bf1[19] = -input[19] + input[12];
+ bf1[20] = -input[20] + input[11];
+ bf1[21] = -input[21] + input[10];
+ bf1[22] = -input[22] + input[9];
+ bf1[23] = -input[23] + input[8];
+ bf1[24] = -input[24] + input[7];
+ bf1[25] = -input[25] + input[6];
+ bf1[26] = -input[26] + input[5];
+ bf1[27] = -input[27] + input[4];
+ bf1[28] = -input[28] + input[3];
+ bf1[29] = -input[29] + input[2];
+ bf1[30] = -input[30] + input[1];
+ bf1[31] = -input[31] + input[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = -bf0[8] + bf0[7];
+ bf1[9] = -bf0[9] + bf0[6];
+ bf1[10] = -bf0[10] + bf0[5];
+ bf1[11] = -bf0[11] + bf0[4];
+ bf1[12] = -bf0[12] + bf0[3];
+ bf1[13] = -bf0[13] + bf0[2];
+ bf1[14] = -bf0[14] + bf0[1];
+ bf1[15] = -bf0[15] + bf0[0];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = -bf0[20] + bf0[19];
+ bf1[21] = -bf0[21] + bf0[18];
+ bf1[22] = -bf0[22] + bf0[17];
+ bf1[23] = -bf0[23] + bf0[16];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[28] + bf0[27];
+ bf1[29] = bf0[29] + bf0[26];
+ bf1[30] = bf0[30] + bf0[25];
+ bf1[31] = bf0[31] + bf0[24];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = -bf0[18] + bf0[17];
+ bf1[19] = -bf0[19] + bf0[16];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[22] + bf0[21];
+ bf1[23] = bf0[23] + bf0[20];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = -bf0[26] + bf0[25];
+ bf1[27] = -bf0[27] + bf0[24];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[30] + bf0[29];
+ bf1[31] = bf0[31] + bf0[28];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]);
+ bf1[31] = bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = -bf0[17] + bf0[16];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[19] + bf0[18];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = -bf0[21] + bf0[20];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[23] + bf0[22];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = -bf0[25] + bf0[24];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[27] + bf0[26];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = -bf0[29] + bf0[28];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[31] + bf0[30];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]);
+ bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]);
+ bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]);
+ bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]);
+ bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[16];
+ bf1[2] = bf0[8];
+ bf1[3] = bf0[24];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[20];
+ bf1[6] = bf0[12];
+ bf1[7] = bf0[28];
+ bf1[8] = bf0[2];
+ bf1[9] = bf0[18];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[26];
+ bf1[12] = bf0[6];
+ bf1[13] = bf0[22];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[30];
+ bf1[16] = bf0[1];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[9];
+ bf1[19] = bf0[25];
+ bf1[20] = bf0[5];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[13];
+ bf1[23] = bf0[29];
+ bf1[24] = bf0[3];
+ bf1[25] = bf0[19];
+ bf1[26] = bf0[11];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[7];
+ bf1[29] = bf0[23];
+ bf1[30] = bf0[15];
+ bf1[31] = bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
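+
+// Compared with the old av1_fdct32() flow, which damped intermediate
+// magnitudes with half_round_shift() to keep them within 16 bits, this flow
+// keeps full 32-bit intermediates and bounds them through the per-stage
+// cos_bit[] / stage_range[] parameters that range_check() verifies.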
+
+void av1_fadst4_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 4;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[4];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[3];
+ bf1[1] = input[0];
+ bf1[2] = input[1];
+ bf1[3] = input[2];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[8], bf0[1], cospi[56], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(-cospi[40], bf0[3], cospi[24], bf0[2], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = -bf0[2] + bf0[0];
+ bf1[3] = -bf0[3] + bf0[1];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = -bf0[2];
+ bf1[2] = bf0[3];
+ bf1[3] = -bf0[1];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
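+
+// The fadst*_new flows share one shape: an input permutation, a first
+// rotation stage (here the cospi[8]/cospi[56] and cospi[40]/cospi[24] pairs),
+// DCT-like add/sub and rotation stages, and a final stage that reorders the
+// nodes and negates every other output to produce the ADST sign pattern.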
+
+void av1_fadst8_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 8;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[7];
+ bf1[1] = input[0];
+ bf1[2] = input[5];
+ bf1[3] = input[2];
+ bf1[4] = input[3];
+ bf1[5] = input[4];
+ bf1[6] = input[1];
+ bf1[7] = input[6];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[4], bf0[1], cospi[60], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(-cospi[20], bf0[3], cospi[44], bf0[2], cos_bit[stage]);
+ bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(-cospi[36], bf0[5], cospi[28], bf0[4], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(-cospi[52], bf0[7], cospi[12], bf0[6], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = -bf0[4] + bf0[0];
+ bf1[5] = -bf0[5] + bf0[1];
+ bf1[6] = -bf0[6] + bf0[2];
+ bf1[7] = -bf0[7] + bf0[3];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = -bf0[2] + bf0[0];
+ bf1[3] = -bf0[3] + bf0[1];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = -bf0[6] + bf0[4];
+ bf1[7] = -bf0[7] + bf0[5];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = -bf0[4];
+ bf1[2] = bf0[6];
+ bf1[3] = -bf0[2];
+ bf1[4] = bf0[3];
+ bf1[5] = -bf0[7];
+ bf1[6] = bf0[5];
+ bf1[7] = -bf0[1];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fadst16_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 16;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[15];
+ bf1[1] = input[0];
+ bf1[2] = input[13];
+ bf1[3] = input[2];
+ bf1[4] = input[11];
+ bf1[5] = input[4];
+ bf1[6] = input[9];
+ bf1[7] = input[6];
+ bf1[8] = input[7];
+ bf1[9] = input[8];
+ bf1[10] = input[5];
+ bf1[11] = input[10];
+ bf1[12] = input[3];
+ bf1[13] = input[12];
+ bf1[14] = input[1];
+ bf1[15] = input[14];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[2], bf0[1], cospi[62], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(-cospi[10], bf0[3], cospi[54], bf0[2], cos_bit[stage]);
+ bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(-cospi[18], bf0[5], cospi[46], bf0[4], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(-cospi[26], bf0[7], cospi[38], bf0[6], cos_bit[stage]);
+ bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit[stage]);
+ bf1[9] = half_btf(-cospi[34], bf0[9], cospi[30], bf0[8], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[42], bf0[11], cospi[22], bf0[10], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(-cospi[50], bf0[13], cospi[14], bf0[12], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(-cospi[58], bf0[15], cospi[6], bf0[14], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[8];
+ bf1[1] = bf0[1] + bf0[9];
+ bf1[2] = bf0[2] + bf0[10];
+ bf1[3] = bf0[3] + bf0[11];
+ bf1[4] = bf0[4] + bf0[12];
+ bf1[5] = bf0[5] + bf0[13];
+ bf1[6] = bf0[6] + bf0[14];
+ bf1[7] = bf0[7] + bf0[15];
+ bf1[8] = -bf0[8] + bf0[0];
+ bf1[9] = -bf0[9] + bf0[1];
+ bf1[10] = -bf0[10] + bf0[2];
+ bf1[11] = -bf0[11] + bf0[3];
+ bf1[12] = -bf0[12] + bf0[4];
+ bf1[13] = -bf0[13] + bf0[5];
+ bf1[14] = -bf0[14] + bf0[6];
+ bf1[15] = -bf0[15] + bf0[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
+ bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]);
+ bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]);
+ bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = -bf0[4] + bf0[0];
+ bf1[5] = -bf0[5] + bf0[1];
+ bf1[6] = -bf0[6] + bf0[2];
+ bf1[7] = -bf0[7] + bf0[3];
+ bf1[8] = bf0[8] + bf0[12];
+ bf1[9] = bf0[9] + bf0[13];
+ bf1[10] = bf0[10] + bf0[14];
+ bf1[11] = bf0[11] + bf0[15];
+ bf1[12] = -bf0[12] + bf0[8];
+ bf1[13] = -bf0[13] + bf0[9];
+ bf1[14] = -bf0[14] + bf0[10];
+ bf1[15] = -bf0[15] + bf0[11];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]);
+ bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = -bf0[2] + bf0[0];
+ bf1[3] = -bf0[3] + bf0[1];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = -bf0[6] + bf0[4];
+ bf1[7] = -bf0[7] + bf0[5];
+ bf1[8] = bf0[8] + bf0[10];
+ bf1[9] = bf0[9] + bf0[11];
+ bf1[10] = -bf0[10] + bf0[8];
+ bf1[11] = -bf0[11] + bf0[9];
+ bf1[12] = bf0[12] + bf0[14];
+ bf1[13] = bf0[13] + bf0[15];
+ bf1[14] = -bf0[14] + bf0[12];
+ bf1[15] = -bf0[15] + bf0[13];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit[stage]);
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = -bf0[8];
+ bf1[2] = bf0[12];
+ bf1[3] = -bf0[4];
+ bf1[4] = bf0[6];
+ bf1[5] = -bf0[14];
+ bf1[6] = bf0[10];
+ bf1[7] = -bf0[2];
+ bf1[8] = bf0[3];
+ bf1[9] = -bf0[11];
+ bf1[10] = bf0[15];
+ bf1[11] = -bf0[7];
+ bf1[12] = bf0[5];
+ bf1[13] = -bf0[13];
+ bf1[14] = bf0[9];
+ bf1[15] = -bf0[1];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
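+
+// A minimal usage sketch (the cos_bit/stage_range values below are
+// placeholders; the real per-stage bit widths come from the transform
+// configuration tables elsewhere in the library):
+//
+//   int32_t in[16] = { 0 }, out[16];
+//   const int8_t cos_bit[10]     = { 13, 13, 13, 13, 13, 13, 13, 13, 13, 13 };
+//   const int8_t stage_range[10] = { 16, 16, 17, 17, 18, 18, 19, 19, 19, 19 };
+//   av1_fadst16_new(in, out, cos_bit, stage_range);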
+
+void av1_fadst32_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 32;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[32];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[31];
+ bf1[1] = input[0];
+ bf1[2] = input[29];
+ bf1[3] = input[2];
+ bf1[4] = input[27];
+ bf1[5] = input[4];
+ bf1[6] = input[25];
+ bf1[7] = input[6];
+ bf1[8] = input[23];
+ bf1[9] = input[8];
+ bf1[10] = input[21];
+ bf1[11] = input[10];
+ bf1[12] = input[19];
+ bf1[13] = input[12];
+ bf1[14] = input[17];
+ bf1[15] = input[14];
+ bf1[16] = input[15];
+ bf1[17] = input[16];
+ bf1[18] = input[13];
+ bf1[19] = input[18];
+ bf1[20] = input[11];
+ bf1[21] = input[20];
+ bf1[22] = input[9];
+ bf1[23] = input[22];
+ bf1[24] = input[7];
+ bf1[25] = input[24];
+ bf1[26] = input[5];
+ bf1[27] = input[26];
+ bf1[28] = input[3];
+ bf1[29] = input[28];
+ bf1[30] = input[1];
+ bf1[31] = input[30];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[1], bf0[1], cospi[63], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(-cospi[5], bf0[3], cospi[59], bf0[2], cos_bit[stage]);
+ bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(-cospi[9], bf0[5], cospi[55], bf0[4], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(-cospi[13], bf0[7], cospi[51], bf0[6], cos_bit[stage]);
+ bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit[stage]);
+ bf1[9] = half_btf(-cospi[17], bf0[9], cospi[47], bf0[8], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[21], bf0[11], cospi[43], bf0[10], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(-cospi[25], bf0[13], cospi[39], bf0[12], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(-cospi[29], bf0[15], cospi[35], bf0[14], cos_bit[stage]);
+ bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit[stage]);
+ bf1[17] = half_btf(-cospi[33], bf0[17], cospi[31], bf0[16], cos_bit[stage]);
+ bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit[stage]);
+ bf1[19] = half_btf(-cospi[37], bf0[19], cospi[27], bf0[18], cos_bit[stage]);
+ bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[41], bf0[21], cospi[23], bf0[20], cos_bit[stage]);
+ bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit[stage]);
+ bf1[23] = half_btf(-cospi[45], bf0[23], cospi[19], bf0[22], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit[stage]);
+ bf1[25] = half_btf(-cospi[49], bf0[25], cospi[15], bf0[24], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit[stage]);
+ bf1[27] = half_btf(-cospi[53], bf0[27], cospi[11], bf0[26], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit[stage]);
+ bf1[29] = half_btf(-cospi[57], bf0[29], cospi[7], bf0[28], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(-cospi[61], bf0[31], cospi[3], bf0[30], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[16];
+ bf1[1] = bf0[1] + bf0[17];
+ bf1[2] = bf0[2] + bf0[18];
+ bf1[3] = bf0[3] + bf0[19];
+ bf1[4] = bf0[4] + bf0[20];
+ bf1[5] = bf0[5] + bf0[21];
+ bf1[6] = bf0[6] + bf0[22];
+ bf1[7] = bf0[7] + bf0[23];
+ bf1[8] = bf0[8] + bf0[24];
+ bf1[9] = bf0[9] + bf0[25];
+ bf1[10] = bf0[10] + bf0[26];
+ bf1[11] = bf0[11] + bf0[27];
+ bf1[12] = bf0[12] + bf0[28];
+ bf1[13] = bf0[13] + bf0[29];
+ bf1[14] = bf0[14] + bf0[30];
+ bf1[15] = bf0[15] + bf0[31];
+ bf1[16] = -bf0[16] + bf0[0];
+ bf1[17] = -bf0[17] + bf0[1];
+ bf1[18] = -bf0[18] + bf0[2];
+ bf1[19] = -bf0[19] + bf0[3];
+ bf1[20] = -bf0[20] + bf0[4];
+ bf1[21] = -bf0[21] + bf0[5];
+ bf1[22] = -bf0[22] + bf0[6];
+ bf1[23] = -bf0[23] + bf0[7];
+ bf1[24] = -bf0[24] + bf0[8];
+ bf1[25] = -bf0[25] + bf0[9];
+ bf1[26] = -bf0[26] + bf0[10];
+ bf1[27] = -bf0[27] + bf0[11];
+ bf1[28] = -bf0[28] + bf0[12];
+ bf1[29] = -bf0[29] + bf0[13];
+ bf1[30] = -bf0[30] + bf0[14];
+ bf1[31] = -bf0[31] + bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit[stage]);
+ bf1[17] = half_btf(-cospi[4], bf0[17], cospi[60], bf0[16], cos_bit[stage]);
+ bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit[stage]);
+ bf1[19] = half_btf(-cospi[20], bf0[19], cospi[44], bf0[18], cos_bit[stage]);
+ bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[36], bf0[21], cospi[28], bf0[20], cos_bit[stage]);
+ bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit[stage]);
+ bf1[23] = half_btf(-cospi[52], bf0[23], cospi[12], bf0[22], cos_bit[stage]);
+ bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[60], bf0[25], cospi[4], bf0[24], cos_bit[stage]);
+ bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[44], bf0[27], cospi[20], bf0[26], cos_bit[stage]);
+ bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[28], bf0[29], cospi[36], bf0[28], cos_bit[stage]);
+ bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[12], bf0[31], cospi[52], bf0[30], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[8];
+ bf1[1] = bf0[1] + bf0[9];
+ bf1[2] = bf0[2] + bf0[10];
+ bf1[3] = bf0[3] + bf0[11];
+ bf1[4] = bf0[4] + bf0[12];
+ bf1[5] = bf0[5] + bf0[13];
+ bf1[6] = bf0[6] + bf0[14];
+ bf1[7] = bf0[7] + bf0[15];
+ bf1[8] = -bf0[8] + bf0[0];
+ bf1[9] = -bf0[9] + bf0[1];
+ bf1[10] = -bf0[10] + bf0[2];
+ bf1[11] = -bf0[11] + bf0[3];
+ bf1[12] = -bf0[12] + bf0[4];
+ bf1[13] = -bf0[13] + bf0[5];
+ bf1[14] = -bf0[14] + bf0[6];
+ bf1[15] = -bf0[15] + bf0[7];
+ bf1[16] = bf0[16] + bf0[24];
+ bf1[17] = bf0[17] + bf0[25];
+ bf1[18] = bf0[18] + bf0[26];
+ bf1[19] = bf0[19] + bf0[27];
+ bf1[20] = bf0[20] + bf0[28];
+ bf1[21] = bf0[21] + bf0[29];
+ bf1[22] = bf0[22] + bf0[30];
+ bf1[23] = bf0[23] + bf0[31];
+ bf1[24] = -bf0[24] + bf0[16];
+ bf1[25] = -bf0[25] + bf0[17];
+ bf1[26] = -bf0[26] + bf0[18];
+ bf1[27] = -bf0[27] + bf0[19];
+ bf1[28] = -bf0[28] + bf0[20];
+ bf1[29] = -bf0[29] + bf0[21];
+ bf1[30] = -bf0[30] + bf0[22];
+ bf1[31] = -bf0[31] + bf0[23];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
+ bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]);
+ bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]);
+ bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit[stage]);
+ bf1[25] = half_btf(-cospi[8], bf0[25], cospi[56], bf0[24], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit[stage]);
+ bf1[27] = half_btf(-cospi[40], bf0[27], cospi[24], bf0[26], cos_bit[stage]);
+ bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[56], bf0[29], cospi[8], bf0[28], cos_bit[stage]);
+ bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[24], bf0[31], cospi[40], bf0[30], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = -bf0[4] + bf0[0];
+ bf1[5] = -bf0[5] + bf0[1];
+ bf1[6] = -bf0[6] + bf0[2];
+ bf1[7] = -bf0[7] + bf0[3];
+ bf1[8] = bf0[8] + bf0[12];
+ bf1[9] = bf0[9] + bf0[13];
+ bf1[10] = bf0[10] + bf0[14];
+ bf1[11] = bf0[11] + bf0[15];
+ bf1[12] = -bf0[12] + bf0[8];
+ bf1[13] = -bf0[13] + bf0[9];
+ bf1[14] = -bf0[14] + bf0[10];
+ bf1[15] = -bf0[15] + bf0[11];
+ bf1[16] = bf0[16] + bf0[20];
+ bf1[17] = bf0[17] + bf0[21];
+ bf1[18] = bf0[18] + bf0[22];
+ bf1[19] = bf0[19] + bf0[23];
+ bf1[20] = -bf0[20] + bf0[16];
+ bf1[21] = -bf0[21] + bf0[17];
+ bf1[22] = -bf0[22] + bf0[18];
+ bf1[23] = -bf0[23] + bf0[19];
+ bf1[24] = bf0[24] + bf0[28];
+ bf1[25] = bf0[25] + bf0[29];
+ bf1[26] = bf0[26] + bf0[30];
+ bf1[27] = bf0[27] + bf0[31];
+ bf1[28] = -bf0[28] + bf0[24];
+ bf1[29] = -bf0[29] + bf0[25];
+ bf1[30] = -bf0[30] + bf0[26];
+ bf1[31] = -bf0[31] + bf0[27];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]);
+ bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[20], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[48], bf0[23], cospi[16], bf0[22], cos_bit[stage]);
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = bf0[26];
+ bf1[27] = bf0[27];
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit[stage]);
+ bf1[29] = half_btf(-cospi[16], bf0[29], cospi[48], bf0[28], cos_bit[stage]);
+ bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[48], bf0[31], cospi[16], bf0[30], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = -bf0[2] + bf0[0];
+ bf1[3] = -bf0[3] + bf0[1];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = -bf0[6] + bf0[4];
+ bf1[7] = -bf0[7] + bf0[5];
+ bf1[8] = bf0[8] + bf0[10];
+ bf1[9] = bf0[9] + bf0[11];
+ bf1[10] = -bf0[10] + bf0[8];
+ bf1[11] = -bf0[11] + bf0[9];
+ bf1[12] = bf0[12] + bf0[14];
+ bf1[13] = bf0[13] + bf0[15];
+ bf1[14] = -bf0[14] + bf0[12];
+ bf1[15] = -bf0[15] + bf0[13];
+ bf1[16] = bf0[16] + bf0[18];
+ bf1[17] = bf0[17] + bf0[19];
+ bf1[18] = -bf0[18] + bf0[16];
+ bf1[19] = -bf0[19] + bf0[17];
+ bf1[20] = bf0[20] + bf0[22];
+ bf1[21] = bf0[21] + bf0[23];
+ bf1[22] = -bf0[22] + bf0[20];
+ bf1[23] = -bf0[23] + bf0[21];
+ bf1[24] = bf0[24] + bf0[26];
+ bf1[25] = bf0[25] + bf0[27];
+ bf1[26] = -bf0[26] + bf0[24];
+ bf1[27] = -bf0[27] + bf0[25];
+ bf1[28] = bf0[28] + bf0[30];
+ bf1[29] = bf0[29] + bf0[31];
+ bf1[30] = -bf0[30] + bf0[28];
+ bf1[31] = -bf0[31] + bf0[29];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 10
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit[stage]);
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit[stage]);
+ bf1[19] = half_btf(-cospi[32], bf0[19], cospi[32], bf0[18], cos_bit[stage]);
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit[stage]);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[22], cos_bit[stage]);
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit[stage]);
+ bf1[27] = half_btf(-cospi[32], bf0[27], cospi[32], bf0[26], cos_bit[stage]);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(-cospi[32], bf0[31], cospi[32], bf0[30], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 11
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = -bf0[16];
+ bf1[2] = bf0[24];
+ bf1[3] = -bf0[8];
+ bf1[4] = bf0[12];
+ bf1[5] = -bf0[28];
+ bf1[6] = bf0[20];
+ bf1[7] = -bf0[4];
+ bf1[8] = bf0[6];
+ bf1[9] = -bf0[22];
+ bf1[10] = bf0[30];
+ bf1[11] = -bf0[14];
+ bf1[12] = bf0[10];
+ bf1[13] = -bf0[26];
+ bf1[14] = bf0[18];
+ bf1[15] = -bf0[2];
+ bf1[16] = bf0[3];
+ bf1[17] = -bf0[19];
+ bf1[18] = bf0[27];
+ bf1[19] = -bf0[11];
+ bf1[20] = bf0[15];
+ bf1[21] = -bf0[31];
+ bf1[22] = bf0[23];
+ bf1[23] = -bf0[7];
+ bf1[24] = bf0[5];
+ bf1[25] = -bf0[21];
+ bf1[26] = bf0[29];
+ bf1[27] = -bf0[13];
+ bf1[28] = bf0[9];
+ bf1[29] = -bf0[25];
+ bf1[30] = bf0[17];
+ bf1[31] = -bf0[1];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
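Every non-trivial stage above is built from one primitive, half_btf(), which the 1D transforms pull in via av1/common/av1_txfm.h rather than from this hunk. The sketch below shows what that primitive computes, following the convention used throughout this patch (cospi[] entries are cosines scaled by 2^cos_bit); the helper bodies are reproduced from the surrounding convention, not from this change, so treat the exact definitions as an assumption.

/* Reference sketch (assumed; mirrors the helpers in av1/common/av1_txfm.h):
 * one butterfly output w0*in0 + w1*in1, where w0/w1 are cospi[] weights
 * stored as round(cos(k*pi/64) * 2^bit), rounded back down by `bit`. */
static inline int32_t round_shift_sketch(int64_t value, int bit) {
  return (int32_t)((value + (1LL << (bit - 1))) >> bit);  /* round to nearest */
}

static inline int32_t half_btf_sketch(int32_t w0, int32_t in0, int32_t w1,
                                      int32_t in1, int bit) {
  const int64_t result = (int64_t)w0 * in0 + (int64_t)w1 * in1;
  return round_shift_sketch(result, bit);
}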
+
+#if CONFIG_TX64X64
+void av1_fdct64_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 64;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[64];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf1 = output;
+ bf1[0] = input[0] + input[63];
+ bf1[1] = input[1] + input[62];
+ bf1[2] = input[2] + input[61];
+ bf1[3] = input[3] + input[60];
+ bf1[4] = input[4] + input[59];
+ bf1[5] = input[5] + input[58];
+ bf1[6] = input[6] + input[57];
+ bf1[7] = input[7] + input[56];
+ bf1[8] = input[8] + input[55];
+ bf1[9] = input[9] + input[54];
+ bf1[10] = input[10] + input[53];
+ bf1[11] = input[11] + input[52];
+ bf1[12] = input[12] + input[51];
+ bf1[13] = input[13] + input[50];
+ bf1[14] = input[14] + input[49];
+ bf1[15] = input[15] + input[48];
+ bf1[16] = input[16] + input[47];
+ bf1[17] = input[17] + input[46];
+ bf1[18] = input[18] + input[45];
+ bf1[19] = input[19] + input[44];
+ bf1[20] = input[20] + input[43];
+ bf1[21] = input[21] + input[42];
+ bf1[22] = input[22] + input[41];
+ bf1[23] = input[23] + input[40];
+ bf1[24] = input[24] + input[39];
+ bf1[25] = input[25] + input[38];
+ bf1[26] = input[26] + input[37];
+ bf1[27] = input[27] + input[36];
+ bf1[28] = input[28] + input[35];
+ bf1[29] = input[29] + input[34];
+ bf1[30] = input[30] + input[33];
+ bf1[31] = input[31] + input[32];
+ bf1[32] = -input[32] + input[31];
+ bf1[33] = -input[33] + input[30];
+ bf1[34] = -input[34] + input[29];
+ bf1[35] = -input[35] + input[28];
+ bf1[36] = -input[36] + input[27];
+ bf1[37] = -input[37] + input[26];
+ bf1[38] = -input[38] + input[25];
+ bf1[39] = -input[39] + input[24];
+ bf1[40] = -input[40] + input[23];
+ bf1[41] = -input[41] + input[22];
+ bf1[42] = -input[42] + input[21];
+ bf1[43] = -input[43] + input[20];
+ bf1[44] = -input[44] + input[19];
+ bf1[45] = -input[45] + input[18];
+ bf1[46] = -input[46] + input[17];
+ bf1[47] = -input[47] + input[16];
+ bf1[48] = -input[48] + input[15];
+ bf1[49] = -input[49] + input[14];
+ bf1[50] = -input[50] + input[13];
+ bf1[51] = -input[51] + input[12];
+ bf1[52] = -input[52] + input[11];
+ bf1[53] = -input[53] + input[10];
+ bf1[54] = -input[54] + input[9];
+ bf1[55] = -input[55] + input[8];
+ bf1[56] = -input[56] + input[7];
+ bf1[57] = -input[57] + input[6];
+ bf1[58] = -input[58] + input[5];
+ bf1[59] = -input[59] + input[4];
+ bf1[60] = -input[60] + input[3];
+ bf1[61] = -input[61] + input[2];
+ bf1[62] = -input[62] + input[1];
+ bf1[63] = -input[63] + input[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[31];
+ bf1[1] = bf0[1] + bf0[30];
+ bf1[2] = bf0[2] + bf0[29];
+ bf1[3] = bf0[3] + bf0[28];
+ bf1[4] = bf0[4] + bf0[27];
+ bf1[5] = bf0[5] + bf0[26];
+ bf1[6] = bf0[6] + bf0[25];
+ bf1[7] = bf0[7] + bf0[24];
+ bf1[8] = bf0[8] + bf0[23];
+ bf1[9] = bf0[9] + bf0[22];
+ bf1[10] = bf0[10] + bf0[21];
+ bf1[11] = bf0[11] + bf0[20];
+ bf1[12] = bf0[12] + bf0[19];
+ bf1[13] = bf0[13] + bf0[18];
+ bf1[14] = bf0[14] + bf0[17];
+ bf1[15] = bf0[15] + bf0[16];
+ bf1[16] = -bf0[16] + bf0[15];
+ bf1[17] = -bf0[17] + bf0[14];
+ bf1[18] = -bf0[18] + bf0[13];
+ bf1[19] = -bf0[19] + bf0[12];
+ bf1[20] = -bf0[20] + bf0[11];
+ bf1[21] = -bf0[21] + bf0[10];
+ bf1[22] = -bf0[22] + bf0[9];
+ bf1[23] = -bf0[23] + bf0[8];
+ bf1[24] = -bf0[24] + bf0[7];
+ bf1[25] = -bf0[25] + bf0[6];
+ bf1[26] = -bf0[26] + bf0[5];
+ bf1[27] = -bf0[27] + bf0[4];
+ bf1[28] = -bf0[28] + bf0[3];
+ bf1[29] = -bf0[29] + bf0[2];
+ bf1[30] = -bf0[30] + bf0[1];
+ bf1[31] = -bf0[31] + bf0[0];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = bf0[37];
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
+ bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
+ bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
+ bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
+ bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
+ bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
+ bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
+ bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
+ bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit[stage]);
+ bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit[stage]);
+ bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit[stage]);
+ bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit[stage]);
+ bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit[stage]);
+ bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit[stage]);
+ bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit[stage]);
+ bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit[stage]);
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = bf0[58];
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = -bf0[8] + bf0[7];
+ bf1[9] = -bf0[9] + bf0[6];
+ bf1[10] = -bf0[10] + bf0[5];
+ bf1[11] = -bf0[11] + bf0[4];
+ bf1[12] = -bf0[12] + bf0[3];
+ bf1[13] = -bf0[13] + bf0[2];
+ bf1[14] = -bf0[14] + bf0[1];
+ bf1[15] = -bf0[15] + bf0[0];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[47];
+ bf1[33] = bf0[33] + bf0[46];
+ bf1[34] = bf0[34] + bf0[45];
+ bf1[35] = bf0[35] + bf0[44];
+ bf1[36] = bf0[36] + bf0[43];
+ bf1[37] = bf0[37] + bf0[42];
+ bf1[38] = bf0[38] + bf0[41];
+ bf1[39] = bf0[39] + bf0[40];
+ bf1[40] = -bf0[40] + bf0[39];
+ bf1[41] = -bf0[41] + bf0[38];
+ bf1[42] = -bf0[42] + bf0[37];
+ bf1[43] = -bf0[43] + bf0[36];
+ bf1[44] = -bf0[44] + bf0[35];
+ bf1[45] = -bf0[45] + bf0[34];
+ bf1[46] = -bf0[46] + bf0[33];
+ bf1[47] = -bf0[47] + bf0[32];
+ bf1[48] = -bf0[48] + bf0[63];
+ bf1[49] = -bf0[49] + bf0[62];
+ bf1[50] = -bf0[50] + bf0[61];
+ bf1[51] = -bf0[51] + bf0[60];
+ bf1[52] = -bf0[52] + bf0[59];
+ bf1[53] = -bf0[53] + bf0[58];
+ bf1[54] = -bf0[54] + bf0[57];
+ bf1[55] = -bf0[55] + bf0[56];
+ bf1[56] = bf0[56] + bf0[55];
+ bf1[57] = bf0[57] + bf0[54];
+ bf1[58] = bf0[58] + bf0[53];
+ bf1[59] = bf0[59] + bf0[52];
+ bf1[60] = bf0[60] + bf0[51];
+ bf1[61] = bf0[61] + bf0[50];
+ bf1[62] = bf0[62] + bf0[49];
+ bf1[63] = bf0[63] + bf0[48];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = -bf0[20] + bf0[19];
+ bf1[21] = -bf0[21] + bf0[18];
+ bf1[22] = -bf0[22] + bf0[17];
+ bf1[23] = -bf0[23] + bf0[16];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[28] + bf0[27];
+ bf1[29] = bf0[29] + bf0[26];
+ bf1[30] = bf0[30] + bf0[25];
+ bf1[31] = bf0[31] + bf0[24];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]);
+ bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]);
+ bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]);
+ bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]);
+ bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]);
+ bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]);
+ bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]);
+ bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]);
+ bf1[44] = bf0[44];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = bf0[50];
+ bf1[51] = bf0[51];
+ bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit[stage]);
+ bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit[stage]);
+ bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit[stage]);
+ bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit[stage]);
+ bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit[stage]);
+ bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit[stage]);
+ bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit[stage]);
+ bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit[stage]);
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[39];
+ bf1[33] = bf0[33] + bf0[38];
+ bf1[34] = bf0[34] + bf0[37];
+ bf1[35] = bf0[35] + bf0[36];
+ bf1[36] = -bf0[36] + bf0[35];
+ bf1[37] = -bf0[37] + bf0[34];
+ bf1[38] = -bf0[38] + bf0[33];
+ bf1[39] = -bf0[39] + bf0[32];
+ bf1[40] = -bf0[40] + bf0[47];
+ bf1[41] = -bf0[41] + bf0[46];
+ bf1[42] = -bf0[42] + bf0[45];
+ bf1[43] = -bf0[43] + bf0[44];
+ bf1[44] = bf0[44] + bf0[43];
+ bf1[45] = bf0[45] + bf0[42];
+ bf1[46] = bf0[46] + bf0[41];
+ bf1[47] = bf0[47] + bf0[40];
+ bf1[48] = bf0[48] + bf0[55];
+ bf1[49] = bf0[49] + bf0[54];
+ bf1[50] = bf0[50] + bf0[53];
+ bf1[51] = bf0[51] + bf0[52];
+ bf1[52] = -bf0[52] + bf0[51];
+ bf1[53] = -bf0[53] + bf0[50];
+ bf1[54] = -bf0[54] + bf0[49];
+ bf1[55] = -bf0[55] + bf0[48];
+ bf1[56] = -bf0[56] + bf0[63];
+ bf1[57] = -bf0[57] + bf0[62];
+ bf1[58] = -bf0[58] + bf0[61];
+ bf1[59] = -bf0[59] + bf0[60];
+ bf1[60] = bf0[60] + bf0[59];
+ bf1[61] = bf0[61] + bf0[58];
+ bf1[62] = bf0[62] + bf0[57];
+ bf1[63] = bf0[63] + bf0[56];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = -bf0[18] + bf0[17];
+ bf1[19] = -bf0[19] + bf0[16];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[22] + bf0[21];
+ bf1[23] = bf0[23] + bf0[20];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = -bf0[26] + bf0[25];
+ bf1[27] = -bf0[27] + bf0[24];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[30] + bf0[29];
+ bf1[31] = bf0[31] + bf0[28];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]);
+ bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]);
+ bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]);
+ bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]);
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = bf0[41];
+ bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]);
+ bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]);
+ bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]);
+ bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]);
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit[stage]);
+ bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit[stage]);
+ bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit[stage]);
+ bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit[stage]);
+ bf1[54] = bf0[54];
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit[stage]);
+ bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit[stage]);
+ bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit[stage]);
+ bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit[stage]);
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]);
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[35];
+ bf1[33] = bf0[33] + bf0[34];
+ bf1[34] = -bf0[34] + bf0[33];
+ bf1[35] = -bf0[35] + bf0[32];
+ bf1[36] = -bf0[36] + bf0[39];
+ bf1[37] = -bf0[37] + bf0[38];
+ bf1[38] = bf0[38] + bf0[37];
+ bf1[39] = bf0[39] + bf0[36];
+ bf1[40] = bf0[40] + bf0[43];
+ bf1[41] = bf0[41] + bf0[42];
+ bf1[42] = -bf0[42] + bf0[41];
+ bf1[43] = -bf0[43] + bf0[40];
+ bf1[44] = -bf0[44] + bf0[47];
+ bf1[45] = -bf0[45] + bf0[46];
+ bf1[46] = bf0[46] + bf0[45];
+ bf1[47] = bf0[47] + bf0[44];
+ bf1[48] = bf0[48] + bf0[51];
+ bf1[49] = bf0[49] + bf0[50];
+ bf1[50] = -bf0[50] + bf0[49];
+ bf1[51] = -bf0[51] + bf0[48];
+ bf1[52] = -bf0[52] + bf0[55];
+ bf1[53] = -bf0[53] + bf0[54];
+ bf1[54] = bf0[54] + bf0[53];
+ bf1[55] = bf0[55] + bf0[52];
+ bf1[56] = bf0[56] + bf0[59];
+ bf1[57] = bf0[57] + bf0[58];
+ bf1[58] = -bf0[58] + bf0[57];
+ bf1[59] = -bf0[59] + bf0[56];
+ bf1[60] = -bf0[60] + bf0[63];
+ bf1[61] = -bf0[61] + bf0[62];
+ bf1[62] = bf0[62] + bf0[61];
+ bf1[63] = bf0[63] + bf0[60];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = -bf0[17] + bf0[16];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[19] + bf0[18];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = -bf0[21] + bf0[20];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[23] + bf0[22];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = -bf0[25] + bf0[24];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[27] + bf0[26];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = -bf0[29] + bf0[28];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[31] + bf0[30];
+ bf1[32] = bf0[32];
+ bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]);
+ bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]);
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]);
+ bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]);
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]);
+ bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]);
+ bf1[43] = bf0[43];
+ bf1[44] = bf0[44];
+ bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]);
+ bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]);
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit[stage]);
+ bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit[stage]);
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[52];
+ bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit[stage]);
+ bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit[stage]);
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit[stage]);
+ bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit[stage]);
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit[stage]);
+ bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit[stage]);
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]);
+ bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]);
+ bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]);
+ bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]);
+ bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]);
+ bf1[32] = bf0[32] + bf0[33];
+ bf1[33] = -bf0[33] + bf0[32];
+ bf1[34] = -bf0[34] + bf0[35];
+ bf1[35] = bf0[35] + bf0[34];
+ bf1[36] = bf0[36] + bf0[37];
+ bf1[37] = -bf0[37] + bf0[36];
+ bf1[38] = -bf0[38] + bf0[39];
+ bf1[39] = bf0[39] + bf0[38];
+ bf1[40] = bf0[40] + bf0[41];
+ bf1[41] = -bf0[41] + bf0[40];
+ bf1[42] = -bf0[42] + bf0[43];
+ bf1[43] = bf0[43] + bf0[42];
+ bf1[44] = bf0[44] + bf0[45];
+ bf1[45] = -bf0[45] + bf0[44];
+ bf1[46] = -bf0[46] + bf0[47];
+ bf1[47] = bf0[47] + bf0[46];
+ bf1[48] = bf0[48] + bf0[49];
+ bf1[49] = -bf0[49] + bf0[48];
+ bf1[50] = -bf0[50] + bf0[51];
+ bf1[51] = bf0[51] + bf0[50];
+ bf1[52] = bf0[52] + bf0[53];
+ bf1[53] = -bf0[53] + bf0[52];
+ bf1[54] = -bf0[54] + bf0[55];
+ bf1[55] = bf0[55] + bf0[54];
+ bf1[56] = bf0[56] + bf0[57];
+ bf1[57] = -bf0[57] + bf0[56];
+ bf1[58] = -bf0[58] + bf0[59];
+ bf1[59] = bf0[59] + bf0[58];
+ bf1[60] = bf0[60] + bf0[61];
+ bf1[61] = -bf0[61] + bf0[60];
+ bf1[62] = -bf0[62] + bf0[63];
+ bf1[63] = bf0[63] + bf0[62];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 10
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = bf0[26];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit[stage]);
+ bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit[stage]);
+ bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit[stage]);
+ bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit[stage]);
+ bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit[stage]);
+ bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit[stage]);
+ bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit[stage]);
+ bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit[stage]);
+ bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit[stage]);
+ bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit[stage]);
+ bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit[stage]);
+ bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit[stage]);
+ bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit[stage]);
+ bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit[stage]);
+ bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit[stage]);
+ bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit[stage]);
+ bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit[stage]);
+ bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit[stage]);
+ bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit[stage]);
+ bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit[stage]);
+ bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit[stage]);
+ bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit[stage]);
+ bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit[stage]);
+ bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit[stage]);
+ bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit[stage]);
+ bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit[stage]);
+ bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit[stage]);
+ bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit[stage]);
+ bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit[stage]);
+ bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit[stage]);
+ bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit[stage]);
+ bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 11
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[32];
+ bf1[2] = bf0[16];
+ bf1[3] = bf0[48];
+ bf1[4] = bf0[8];
+ bf1[5] = bf0[40];
+ bf1[6] = bf0[24];
+ bf1[7] = bf0[56];
+ bf1[8] = bf0[4];
+ bf1[9] = bf0[36];
+ bf1[10] = bf0[20];
+ bf1[11] = bf0[52];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[44];
+ bf1[14] = bf0[28];
+ bf1[15] = bf0[60];
+ bf1[16] = bf0[2];
+ bf1[17] = bf0[34];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[50];
+ bf1[20] = bf0[10];
+ bf1[21] = bf0[42];
+ bf1[22] = bf0[26];
+ bf1[23] = bf0[58];
+ bf1[24] = bf0[6];
+ bf1[25] = bf0[38];
+ bf1[26] = bf0[22];
+ bf1[27] = bf0[54];
+ bf1[28] = bf0[14];
+ bf1[29] = bf0[46];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[62];
+ bf1[32] = bf0[1];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[17];
+ bf1[35] = bf0[49];
+ bf1[36] = bf0[9];
+ bf1[37] = bf0[41];
+ bf1[38] = bf0[25];
+ bf1[39] = bf0[57];
+ bf1[40] = bf0[5];
+ bf1[41] = bf0[37];
+ bf1[42] = bf0[21];
+ bf1[43] = bf0[53];
+ bf1[44] = bf0[13];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[29];
+ bf1[47] = bf0[61];
+ bf1[48] = bf0[3];
+ bf1[49] = bf0[35];
+ bf1[50] = bf0[19];
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[11];
+ bf1[53] = bf0[43];
+ bf1[54] = bf0[27];
+ bf1[55] = bf0[59];
+ bf1[56] = bf0[7];
+ bf1[57] = bf0[39];
+ bf1[58] = bf0[23];
+ bf1[59] = bf0[55];
+ bf1[60] = bf0[15];
+ bf1[61] = bf0[47];
+ bf1[62] = bf0[31];
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+#endif // CONFIG_TX64X64
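One detail worth calling out: the last stage of av1_fdct64_new (stage 11) contains no arithmetic at all. It copies the intermediate values out in 6-bit bit-reversed index order, which is the order the decimation-in-frequency butterflies above leave them in. The helper below is illustration only (not part of the patch) and reproduces the same index mapping that stage 11 writes out element by element.

/* Illustration: bf1[k] = bf0[bitrev6(k)] reproduces stage 11 of
 * av1_fdct64_new, e.g. bitrev6(1) == 32, bitrev6(2) == 16, bitrev6(3) == 48,
 * matching bf1[1] = bf0[32], bf1[2] = bf0[16], bf1[3] = bf0[48] above. */
static int bitrev6(int k) {
  int r = 0;
  for (int b = 0; b < 6; ++b) {
    r = (r << 1) | (k & 1);  /* move the low bit of k onto the top of r */
    k >>= 1;
  }
  return r;
}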
diff --git a/av1/common/av1_fwd_txfm1d.h b/av1/common/av1_fwd_txfm1d.h
new file mode 100644
index 0000000..7aab70e
--- /dev/null
+++ b/av1/common/av1_fwd_txfm1d.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AV1_FWD_TXFM1D_H_
+#define AV1_FWD_TXFM1D_H_
+
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+ const int8_t *stage_range);
+void av1_fdct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+ const int8_t *stage_range);
+void av1_fdct16_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct32_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct64_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+void av1_fadst4_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst8_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst16_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst32_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AV1_FWD_TXFM1D_H_
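All of these 1D transforms share the same calling convention: input and output are int32_t arrays of the transform length, and cos_bit / stage_range carry one entry per butterfly stage (the per-size values live in av1_fwd_txfm2d_cfg.h, added later in this patch). A minimal sketch of driving the 4-point DCT directly, with the stage constants copied from fwd_txfm_2d_cfg_dct_dct_4 and an arbitrary sample input:

#include "av1/common/av1_fwd_txfm1d.h"

static void fdct4_demo(void) {
  /* Per-stage constants copied from fwd_txfm_2d_cfg_dct_dct_4. */
  const int8_t cos_bit[4] = { 13, 13, 13, 13 };
  const int8_t stage_range[4] = { 15, 16, 17, 17 };
  int32_t in[4] = { 10, 20, 30, 40 };  /* arbitrary residual samples */
  int32_t out[4];
  av1_fdct4_new(in, out, cos_bit, stage_range);
  /* out[0] is the (scaled) DC term; out[1..3] are the AC terms in the
   * order produced by the transform's final permutation stage. */
}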
diff --git a/av1/common/av1_fwd_txfm2d.c b/av1/common/av1_fwd_txfm2d.c
new file mode 100644
index 0000000..dc984e1
--- /dev/null
+++ b/av1/common/av1_fwd_txfm2d.c
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_fwd_txfm1d.h"
+#include "av1/common/av1_fwd_txfm2d_cfg.h"
+#include "av1/common/av1_txfm.h"
+
+static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+ switch (txfm_type) {
+ case TXFM_TYPE_DCT4: return av1_fdct4_new;
+ case TXFM_TYPE_DCT8: return av1_fdct8_new;
+ case TXFM_TYPE_DCT16: return av1_fdct16_new;
+ case TXFM_TYPE_DCT32: return av1_fdct32_new;
+ case TXFM_TYPE_ADST4: return av1_fadst4_new;
+ case TXFM_TYPE_ADST8: return av1_fadst8_new;
+ case TXFM_TYPE_ADST16: return av1_fadst16_new;
+ case TXFM_TYPE_ADST32: return av1_fadst32_new;
+ default: assert(0); return NULL;
+ }
+}
+
+static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
+ const int stride, const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *buf) {
+ int c, r;
+ const int txfm_size = cfg->cfg->txfm_size;
+ const int8_t *shift = cfg->cfg->shift;
+ const int8_t *stage_range_col = cfg->cfg->stage_range_col;
+ const int8_t *stage_range_row = cfg->cfg->stage_range_row;
+ const int8_t *cos_bit_col = cfg->cfg->cos_bit_col;
+ const int8_t *cos_bit_row = cfg->cfg->cos_bit_row;
+ const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->cfg->txfm_type_col);
+ const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->cfg->txfm_type_row);
+
+  // use output buffer as temp buffer for the column pass
+ int32_t *temp_in = output;
+ int32_t *temp_out = output + txfm_size;
+
+ // Columns
+ for (c = 0; c < txfm_size; ++c) {
+ if (cfg->ud_flip == 0) {
+ for (r = 0; r < txfm_size; ++r) temp_in[r] = input[r * stride + c];
+ } else {
+ for (r = 0; r < txfm_size; ++r)
+ // flip upside down
+ temp_in[r] = input[(txfm_size - r - 1) * stride + c];
+ }
+ round_shift_array(temp_in, txfm_size, -shift[0]);
+ txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
+ round_shift_array(temp_out, txfm_size, -shift[1]);
+ if (cfg->lr_flip == 0) {
+ for (r = 0; r < txfm_size; ++r) buf[r * txfm_size + c] = temp_out[r];
+ } else {
+ for (r = 0; r < txfm_size; ++r)
+ // flip from left to right
+ buf[r * txfm_size + (txfm_size - c - 1)] = temp_out[r];
+ }
+ }
+
+ // Rows
+ for (r = 0; r < txfm_size; ++r) {
+ txfm_func_row(buf + r * txfm_size, output + r * txfm_size, cos_bit_row,
+ stage_range_row);
+ round_shift_array(output + r * txfm_size, txfm_size, -shift[2]);
+ }
+}
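A note on the buffer reuse in fwd_txfm2d_c: the caller's output array doubles as scratch for the column pass, which is safe because the final coefficients are only written in the row loop, after every column has been read from input and parked in buf. Roughly:

/*   output[0 .. txfm_size-1]            temp_in  (current column, scratch)
 *   output[txfm_size .. 2*txfm_size-1]  temp_out (current column, scratch)
 *   buf[r * txfm_size + c]              column-pass result for row r, col c
 *                                       (mirrored in c when lr_flip is set)
 *   output[r * txfm_size + c]           final coefficients after the row pass */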
+
+void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride,
+ int tx_type, int bd) {
+ int32_t txfm_buf[4 * 4];
+ TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_4X4);
+ (void)bd;
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
+}
+
+void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride,
+ int tx_type, int bd) {
+ int32_t txfm_buf[8 * 8];
+ TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_8X8);
+ (void)bd;
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
+}
+
+void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride,
+ int tx_type, int bd) {
+ int32_t txfm_buf[16 * 16];
+ TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_16X16);
+ (void)bd;
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
+}
+
+void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride,
+ int tx_type, int bd) {
+ int32_t txfm_buf[32 * 32];
+ TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X32);
+ (void)bd;
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
+}
+
+void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride,
+ int tx_type, int bd) {
+ int32_t txfm_buf[64 * 64];
+ TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_64x64_cfg(tx_type);
+ (void)bd;
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
+}
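For reference, a hedged sketch of how the per-size wrappers above are meant to be called: the input is a row-major int16_t residual block addressed with stride, and the output is txfm_size * txfm_size int32_t coefficients. DCT_DCT comes from av1/common/enums.h; the residual values and bit depth here are made up for illustration.

static void fwd_txfm2d_4x4_demo(void) {
  const int16_t residual[4 * 4] = { 1,  2,  3,  4,  5,  6,  7,  8,
                                    9, 10, 11, 12, 13, 14, 15, 16 };
  int32_t coeff[4 * 4];
  /* stride is in int16_t samples; this block is stored contiguously. */
  av1_fwd_txfm2d_4x4_c(residual, coeff, 4 /* stride */, DCT_DCT, 8 /* bd */);
  /* coeff[0] holds the DC coefficient; the rest are row-major AC terms. */
}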
+
+#if CONFIG_EXT_TX
+static const TXFM_2D_CFG *fwd_txfm_cfg_ls[FLIPADST_ADST + 1][TX_SIZES] = {
+ { &fwd_txfm_2d_cfg_dct_dct_4, &fwd_txfm_2d_cfg_dct_dct_8,
+ &fwd_txfm_2d_cfg_dct_dct_16, &fwd_txfm_2d_cfg_dct_dct_32 },
+ { &fwd_txfm_2d_cfg_adst_dct_4, &fwd_txfm_2d_cfg_adst_dct_8,
+ &fwd_txfm_2d_cfg_adst_dct_16, &fwd_txfm_2d_cfg_adst_dct_32 },
+ { &fwd_txfm_2d_cfg_dct_adst_4, &fwd_txfm_2d_cfg_dct_adst_8,
+ &fwd_txfm_2d_cfg_dct_adst_16, &fwd_txfm_2d_cfg_dct_adst_32 },
+ { &fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+ &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32 },
+ { &fwd_txfm_2d_cfg_adst_dct_4, &fwd_txfm_2d_cfg_adst_dct_8,
+ &fwd_txfm_2d_cfg_adst_dct_16, &fwd_txfm_2d_cfg_adst_dct_32 },
+ { &fwd_txfm_2d_cfg_dct_adst_4, &fwd_txfm_2d_cfg_dct_adst_8,
+ &fwd_txfm_2d_cfg_dct_adst_16, &fwd_txfm_2d_cfg_dct_adst_32 },
+ { &fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+ &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32 },
+ { &fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+ &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32 },
+ { &fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+ &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32 },
+};
+#else // CONFIG_EXT_TX
+static const TXFM_2D_CFG *fwd_txfm_cfg_ls[TX_TYPES][TX_SIZES] = {
+ { &fwd_txfm_2d_cfg_dct_dct_4, &fwd_txfm_2d_cfg_dct_dct_8,
+ &fwd_txfm_2d_cfg_dct_dct_16, &fwd_txfm_2d_cfg_dct_dct_32 },
+ { &fwd_txfm_2d_cfg_adst_dct_4, &fwd_txfm_2d_cfg_adst_dct_8,
+ &fwd_txfm_2d_cfg_adst_dct_16, &fwd_txfm_2d_cfg_adst_dct_32 },
+ { &fwd_txfm_2d_cfg_dct_adst_4, &fwd_txfm_2d_cfg_dct_adst_8,
+ &fwd_txfm_2d_cfg_dct_adst_16, &fwd_txfm_2d_cfg_dct_adst_32 },
+ { &fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+ &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32 },
+};
+#endif // CONFIG_EXT_TX
+
+TXFM_2D_FLIP_CFG av1_get_fwd_txfm_cfg(int tx_type, int tx_size) {
+ TXFM_2D_FLIP_CFG cfg;
+ set_flip_cfg(tx_type, &cfg);
+ cfg.cfg = fwd_txfm_cfg_ls[tx_type][tx_size];
+ return cfg;
+}
+
+TXFM_2D_FLIP_CFG av1_get_fwd_txfm_64x64_cfg(int tx_type) {
+ TXFM_2D_FLIP_CFG cfg;
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg.cfg = &fwd_txfm_2d_cfg_dct_dct_64;
+ cfg.ud_flip = 0;
+ cfg.lr_flip = 0;
+ break;
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ default:
+ cfg.ud_flip = 0;
+ cfg.lr_flip = 0;
+ assert(0);
+ }
+ return cfg;
+}
diff --git a/av1/common/av1_fwd_txfm2d_cfg.h b/av1/common/av1_fwd_txfm2d_cfg.h
new file mode 100644
index 0000000..5a7c218
--- /dev/null
+++ b/av1/common/av1_fwd_txfm2d_cfg.h
@@ -0,0 +1,443 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AV1_FWD_TXFM2D_CFG_H_
+#define AV1_FWD_TXFM2D_CFG_H_
+#include "av1/common/enums.h"
+#include "av1/common/av1_fwd_txfm1d.h"
+// ---------------- config fwd_dct_dct_4 ----------------
+static const int8_t fwd_shift_dct_dct_4[3] = { 2, 0, 0 };
+static const int8_t fwd_stage_range_col_dct_dct_4[4] = { 15, 16, 17, 17 };
+static const int8_t fwd_stage_range_row_dct_dct_4[4] = { 17, 18, 18, 18 };
+static const int8_t fwd_cos_bit_col_dct_dct_4[4] = { 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_dct_dct_4[4] = { 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_4 = {
+ 4, // .txfm_size
+ 4, // .stage_num_col
+ 4, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_dct_dct_4, // .shift
+ fwd_stage_range_col_dct_dct_4, // .stage_range_col
+ fwd_stage_range_row_dct_dct_4, // .stage_range_row
+ fwd_cos_bit_col_dct_dct_4, // .cos_bit_col
+ fwd_cos_bit_row_dct_dct_4, // .cos_bit_row
+ TXFM_TYPE_DCT4, // .txfm_type_col
+ TXFM_TYPE_DCT4
+}; // .txfm_type_row
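Reading this dct_dct_4 config against fwd_txfm2d_c earlier in the patch: shift[0] is applied to the column input, shift[1] to the column output, and shift[2] to each transformed row (each is negated before round_shift_array, so a positive entry scales up and a negative one rounds down); cos_bit[i] and stage_range[i] are handed per stage to the 1D transform and to range_check. A hedged worked reading of the numbers above (round_shift_array and cospi_arr themselves are defined in av1_txfm.h, outside this hunk):

/* shift = { 2, 0, 0 }             -> column input scaled up by 2 bits; no
 *                                    further adjustment between or after
 *                                    the passes.
 * stage_range_col = {15,16,17,17} -> assuming the residual fits in 13 bits,
 *                                    the +2 scaling gives 15, each butterfly
 *                                    stage may add one bit, and the final
 *                                    permutation stage adds none;
 *                                    range_check() asserts these bounds.
 * cos_bit = 13 at every stage     -> cospi[] weights are cos(k*pi/64) * 2^13
 *                                    and half_btf() shifts each rotation
 *                                    back down by 13 bits. */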
+
+// ---------------- config fwd_dct_dct_8 ----------------
+static const int8_t fwd_shift_dct_dct_8[3] = { 2, -1, 0 };
+static const int8_t fwd_stage_range_col_dct_dct_8[6] = {
+ 15, 16, 17, 18, 18, 18
+};
+static const int8_t fwd_stage_range_row_dct_dct_8[6] = {
+ 17, 18, 19, 19, 19, 19
+};
+static const int8_t fwd_cos_bit_col_dct_dct_8[6] = { 13, 13, 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_dct_dct_8[6] = { 13, 13, 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_8 = {
+ 8, // .txfm_size
+ 6, // .stage_num_col
+ 6, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_dct_dct_8, // .shift
+ fwd_stage_range_col_dct_dct_8, // .stage_range_col
+ fwd_stage_range_row_dct_dct_8, // .stage_range_row
+ fwd_cos_bit_col_dct_dct_8, // .cos_bit_col
+ fwd_cos_bit_row_dct_dct_8, // .cos_bit_row
+ TXFM_TYPE_DCT8, // .txfm_type_col
+ TXFM_TYPE_DCT8
+}; // .txfm_type_row
+
+// ---------------- config fwd_dct_dct_16 ----------------
+static const int8_t fwd_shift_dct_dct_16[3] = { 2, -2, 0 };
+static const int8_t fwd_stage_range_col_dct_dct_16[8] = { 15, 16, 17, 18,
+ 19, 19, 19, 19 };
+static const int8_t fwd_stage_range_row_dct_dct_16[8] = { 17, 18, 19, 20,
+ 20, 20, 20, 20 };
+static const int8_t fwd_cos_bit_col_dct_dct_16[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_dct_dct_16[8] = { 12, 12, 12, 12,
+ 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_16 = {
+ 16, // .txfm_size
+ 8, // .stage_num_col
+ 8, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_dct_dct_16, // .shift
+ fwd_stage_range_col_dct_dct_16, // .stage_range_col
+ fwd_stage_range_row_dct_dct_16, // .stage_range_row
+ fwd_cos_bit_col_dct_dct_16, // .cos_bit_col
+ fwd_cos_bit_row_dct_dct_16, // .cos_bit_row
+ TXFM_TYPE_DCT16, // .txfm_type_col
+ TXFM_TYPE_DCT16
+}; // .txfm_type_row
+
+// ---------------- config fwd_dct_dct_32 ----------------
+static const int8_t fwd_shift_dct_dct_32[3] = { 2, -4, 0 };
+static const int8_t fwd_stage_range_col_dct_dct_32[10] = { 15, 16, 17, 18, 19,
+ 20, 20, 20, 20, 20 };
+static const int8_t fwd_stage_range_row_dct_dct_32[10] = { 16, 17, 18, 19, 20,
+ 20, 20, 20, 20, 20 };
+static const int8_t fwd_cos_bit_col_dct_dct_32[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+static const int8_t fwd_cos_bit_row_dct_dct_32[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_32 = {
+ 32, // .txfm_size
+ 10, // .stage_num_col
+ 10, // .stage_num_row
+ // 1, // .log_scale
+ fwd_shift_dct_dct_32, // .shift
+ fwd_stage_range_col_dct_dct_32, // .stage_range_col
+ fwd_stage_range_row_dct_dct_32, // .stage_range_row
+ fwd_cos_bit_col_dct_dct_32, // .cos_bit_col
+ fwd_cos_bit_row_dct_dct_32, // .cos_bit_row
+ TXFM_TYPE_DCT32, // .txfm_type_col
+ TXFM_TYPE_DCT32
+}; // .txfm_type_row
+
+// ---------------- config fwd_dct_dct_64 ----------------
+static const int8_t fwd_shift_dct_dct_64[3] = { 0, -2, -2 };
+static const int8_t fwd_stage_range_col_dct_dct_64[12] = {
+ 13, 14, 15, 16, 17, 18, 19, 19, 19, 19, 19, 19
+};
+static const int8_t fwd_stage_range_row_dct_dct_64[12] = {
+ 17, 18, 19, 20, 21, 22, 22, 22, 22, 22, 22, 22
+};
+static const int8_t fwd_cos_bit_col_dct_dct_64[12] = { 15, 15, 15, 15, 15, 14,
+ 13, 13, 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_dct_dct_64[12] = { 15, 14, 13, 12, 11, 10,
+ 10, 10, 10, 10, 10, 10 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_64 = {
+ 64, // .txfm_size
+ 12, // .stage_num_col
+ 12, // .stage_num_row
+ fwd_shift_dct_dct_64, // .shift
+ fwd_stage_range_col_dct_dct_64, // .stage_range_col
+ fwd_stage_range_row_dct_dct_64, // .stage_range_row
+ fwd_cos_bit_col_dct_dct_64, // .cos_bit_col
+ fwd_cos_bit_row_dct_dct_64, // .cos_bit_row
+ TXFM_TYPE_DCT64, // .txfm_type_col
+ TXFM_TYPE_DCT64
+}; // .txfm_type_row
+
+// ---------------- config fwd_dct_adst_4 ----------------
+static const int8_t fwd_shift_dct_adst_4[3] = { 2, 0, 0 };
+static const int8_t fwd_stage_range_col_dct_adst_4[4] = { 15, 16, 17, 17 };
+static const int8_t fwd_stage_range_row_dct_adst_4[6] = {
+ 17, 17, 17, 18, 18, 18
+};
+static const int8_t fwd_cos_bit_col_dct_adst_4[4] = { 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_dct_adst_4[6] = { 13, 13, 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_adst_4 = {
+ 4, // .txfm_size
+ 4, // .stage_num_col
+ 6, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_dct_adst_4, // .shift
+ fwd_stage_range_col_dct_adst_4, // .stage_range_col
+ fwd_stage_range_row_dct_adst_4, // .stage_range_row
+ fwd_cos_bit_col_dct_adst_4, // .cos_bit_col
+ fwd_cos_bit_row_dct_adst_4, // .cos_bit_row
+ TXFM_TYPE_DCT4, // .txfm_type_col
+ TXFM_TYPE_ADST4
+}; // .txfm_type_row
+
+// ---------------- config fwd_dct_adst_8 ----------------
+static const int8_t fwd_shift_dct_adst_8[3] = { 2, -1, 0 };
+static const int8_t fwd_stage_range_col_dct_adst_8[6] = {
+ 15, 16, 17, 18, 18, 18
+};
+static const int8_t fwd_stage_range_row_dct_adst_8[8] = { 17, 17, 17, 18,
+ 18, 19, 19, 19 };
+static const int8_t fwd_cos_bit_col_dct_adst_8[6] = { 13, 13, 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_dct_adst_8[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_adst_8 = {
+ 8, // .txfm_size
+ 6, // .stage_num_col
+ 8, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_dct_adst_8, // .shift
+ fwd_stage_range_col_dct_adst_8, // .stage_range_col
+ fwd_stage_range_row_dct_adst_8, // .stage_range_row
+ fwd_cos_bit_col_dct_adst_8, // .cos_bit_col
+ fwd_cos_bit_row_dct_adst_8, // .cos_bit_row
+ TXFM_TYPE_DCT8, // .txfm_type_col
+ TXFM_TYPE_ADST8
+}; // .txfm_type_row
+
+// ---------------- config fwd_dct_adst_16 ----------------
+static const int8_t fwd_shift_dct_adst_16[3] = { 2, -2, 0 };
+static const int8_t fwd_stage_range_col_dct_adst_16[8] = { 15, 16, 17, 18,
+ 19, 19, 19, 19 };
+static const int8_t fwd_stage_range_row_dct_adst_16[10] = {
+ 17, 17, 17, 18, 18, 19, 19, 20, 20, 20
+};
+static const int8_t fwd_cos_bit_col_dct_adst_16[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_dct_adst_16[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_adst_16 = {
+ 16, // .txfm_size
+ 8, // .stage_num_col
+ 10, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_dct_adst_16, // .shift
+ fwd_stage_range_col_dct_adst_16, // .stage_range_col
+ fwd_stage_range_row_dct_adst_16, // .stage_range_row
+ fwd_cos_bit_col_dct_adst_16, // .cos_bit_col
+ fwd_cos_bit_row_dct_adst_16, // .cos_bit_row
+ TXFM_TYPE_DCT16, // .txfm_type_col
+ TXFM_TYPE_ADST16
+}; // .txfm_type_row
+
+// ---------------- config fwd_dct_adst_32 ----------------
+static const int8_t fwd_shift_dct_adst_32[3] = { 2, -4, 0 };
+static const int8_t fwd_stage_range_col_dct_adst_32[10] = {
+ 15, 16, 17, 18, 19, 20, 20, 20, 20, 20
+};
+static const int8_t fwd_stage_range_row_dct_adst_32[12] = {
+ 16, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20
+};
+static const int8_t fwd_cos_bit_col_dct_adst_32[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+static const int8_t fwd_cos_bit_row_dct_adst_32[12] = {
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_adst_32 = {
+ 32, // .txfm_size
+ 10, // .stage_num_col
+ 12, // .stage_num_row
+ // 1, // .log_scale
+ fwd_shift_dct_adst_32, // .shift
+ fwd_stage_range_col_dct_adst_32, // .stage_range_col
+ fwd_stage_range_row_dct_adst_32, // .stage_range_row
+ fwd_cos_bit_col_dct_adst_32, // .cos_bit_col
+ fwd_cos_bit_row_dct_adst_32, // .cos_bit_row
+ TXFM_TYPE_DCT32, // .txfm_type_col
+ TXFM_TYPE_ADST32
+}; // .txfm_type_row
+// ---------------- config fwd_adst_adst_4 ----------------
+static const int8_t fwd_shift_adst_adst_4[3] = { 2, 0, 0 };
+static const int8_t fwd_stage_range_col_adst_adst_4[6] = { 15, 15, 16,
+ 17, 17, 17 };
+static const int8_t fwd_stage_range_row_adst_adst_4[6] = { 17, 17, 17,
+ 18, 18, 18 };
+static const int8_t fwd_cos_bit_col_adst_adst_4[6] = { 13, 13, 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_adst_adst_4[6] = { 13, 13, 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_adst_4 = {
+ 4, // .txfm_size
+ 6, // .stage_num_col
+ 6, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_adst_adst_4, // .shift
+ fwd_stage_range_col_adst_adst_4, // .stage_range_col
+ fwd_stage_range_row_adst_adst_4, // .stage_range_row
+ fwd_cos_bit_col_adst_adst_4, // .cos_bit_col
+ fwd_cos_bit_row_adst_adst_4, // .cos_bit_row
+ TXFM_TYPE_ADST4, // .txfm_type_col
+ TXFM_TYPE_ADST4
+}; // .txfm_type_row
+
+// ---------------- config fwd_adst_adst_8 ----------------
+static const int8_t fwd_shift_adst_adst_8[3] = { 2, -1, 0 };
+static const int8_t fwd_stage_range_col_adst_adst_8[8] = { 15, 15, 16, 17,
+ 17, 18, 18, 18 };
+static const int8_t fwd_stage_range_row_adst_adst_8[8] = { 17, 17, 17, 18,
+ 18, 19, 19, 19 };
+static const int8_t fwd_cos_bit_col_adst_adst_8[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_adst_adst_8[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_adst_8 = {
+ 8, // .txfm_size
+ 8, // .stage_num_col
+ 8, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_adst_adst_8, // .shift
+ fwd_stage_range_col_adst_adst_8, // .stage_range_col
+ fwd_stage_range_row_adst_adst_8, // .stage_range_row
+ fwd_cos_bit_col_adst_adst_8, // .cos_bit_col
+ fwd_cos_bit_row_adst_adst_8, // .cos_bit_row
+ TXFM_TYPE_ADST8, // .txfm_type_col
+ TXFM_TYPE_ADST8
+}; // .txfm_type_row
+
+// ---------------- config fwd_adst_adst_16 ----------------
+static const int8_t fwd_shift_adst_adst_16[3] = { 2, -2, 0 };
+static const int8_t fwd_stage_range_col_adst_adst_16[10] = {
+ 15, 15, 16, 17, 17, 18, 18, 19, 19, 19
+};
+static const int8_t fwd_stage_range_row_adst_adst_16[10] = {
+ 17, 17, 17, 18, 18, 19, 19, 20, 20, 20
+};
+static const int8_t fwd_cos_bit_col_adst_adst_16[10] = { 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_adst_adst_16[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_adst_16 = {
+ 16, // .txfm_size
+ 10, // .stage_num_col
+ 10, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_adst_adst_16, // .shift
+ fwd_stage_range_col_adst_adst_16, // .stage_range_col
+ fwd_stage_range_row_adst_adst_16, // .stage_range_row
+ fwd_cos_bit_col_adst_adst_16, // .cos_bit_col
+ fwd_cos_bit_row_adst_adst_16, // .cos_bit_row
+ TXFM_TYPE_ADST16, // .txfm_type_col
+ TXFM_TYPE_ADST16
+}; // .txfm_type_row
+
+// ---------------- config fwd_adst_adst_32 ----------------
+static const int8_t fwd_shift_adst_adst_32[3] = { 2, -4, 0 };
+static const int8_t fwd_stage_range_col_adst_adst_32[12] = {
+ 15, 15, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20
+};
+static const int8_t fwd_stage_range_row_adst_adst_32[12] = {
+ 16, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20
+};
+static const int8_t fwd_cos_bit_col_adst_adst_32[12] = {
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+};
+static const int8_t fwd_cos_bit_row_adst_adst_32[12] = {
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_adst_32 = {
+ 32, // .txfm_size
+ 12, // .stage_num_col
+ 12, // .stage_num_row
+ // 1, // .log_scale
+ fwd_shift_adst_adst_32, // .shift
+ fwd_stage_range_col_adst_adst_32, // .stage_range_col
+ fwd_stage_range_row_adst_adst_32, // .stage_range_row
+ fwd_cos_bit_col_adst_adst_32, // .cos_bit_col
+ fwd_cos_bit_row_adst_adst_32, // .cos_bit_row
+  TXFM_TYPE_ADST32, // .txfm_type_col
+  TXFM_TYPE_ADST32 // .txfm_type_row
+};
+
+// ---------------- config fwd_adst_dct_4 ----------------
+static const int8_t fwd_shift_adst_dct_4[3] = { 2, 0, 0 };
+static const int8_t fwd_stage_range_col_adst_dct_4[6] = {
+ 15, 15, 16, 17, 17, 17
+};
+static const int8_t fwd_stage_range_row_adst_dct_4[4] = { 17, 18, 18, 18 };
+static const int8_t fwd_cos_bit_col_adst_dct_4[6] = { 13, 13, 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_adst_dct_4[4] = { 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_dct_4 = {
+ 4, // .txfm_size
+ 6, // .stage_num_col
+ 4, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_adst_dct_4, // .shift
+ fwd_stage_range_col_adst_dct_4, // .stage_range_col
+ fwd_stage_range_row_adst_dct_4, // .stage_range_row
+ fwd_cos_bit_col_adst_dct_4, // .cos_bit_col
+ fwd_cos_bit_row_adst_dct_4, // .cos_bit_row
+  TXFM_TYPE_ADST4, // .txfm_type_col
+  TXFM_TYPE_DCT4 // .txfm_type_row
+};
+
+// ---------------- config fwd_adst_dct_8 ----------------
+static const int8_t fwd_shift_adst_dct_8[3] = { 2, -1, 0 };
+static const int8_t fwd_stage_range_col_adst_dct_8[8] = { 15, 15, 16, 17,
+ 17, 18, 18, 18 };
+static const int8_t fwd_stage_range_row_adst_dct_8[6] = {
+ 17, 18, 19, 19, 19, 19
+};
+static const int8_t fwd_cos_bit_col_adst_dct_8[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_adst_dct_8[6] = { 13, 13, 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_dct_8 = {
+ 8, // .txfm_size
+ 8, // .stage_num_col
+ 6, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_adst_dct_8, // .shift
+ fwd_stage_range_col_adst_dct_8, // .stage_range_col
+ fwd_stage_range_row_adst_dct_8, // .stage_range_row
+ fwd_cos_bit_col_adst_dct_8, // .cos_bit_col
+ fwd_cos_bit_row_adst_dct_8, // .cos_bit_row
+  TXFM_TYPE_ADST8, // .txfm_type_col
+  TXFM_TYPE_DCT8 // .txfm_type_row
+};
+
+// ---------------- config fwd_adst_dct_16 ----------------
+static const int8_t fwd_shift_adst_dct_16[3] = { 2, -2, 0 };
+static const int8_t fwd_stage_range_col_adst_dct_16[10] = {
+ 15, 15, 16, 17, 17, 18, 18, 19, 19, 19
+};
+static const int8_t fwd_stage_range_row_adst_dct_16[8] = { 17, 18, 19, 20,
+ 20, 20, 20, 20 };
+static const int8_t fwd_cos_bit_col_adst_dct_16[10] = { 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_adst_dct_16[8] = { 12, 12, 12, 12,
+ 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_dct_16 = {
+ 16, // .txfm_size
+ 10, // .stage_num_col
+ 8, // .stage_num_row
+ // 0, // .log_scale
+ fwd_shift_adst_dct_16, // .shift
+ fwd_stage_range_col_adst_dct_16, // .stage_range_col
+ fwd_stage_range_row_adst_dct_16, // .stage_range_row
+ fwd_cos_bit_col_adst_dct_16, // .cos_bit_col
+ fwd_cos_bit_row_adst_dct_16, // .cos_bit_row
+  TXFM_TYPE_ADST16, // .txfm_type_col
+  TXFM_TYPE_DCT16 // .txfm_type_row
+};
+
+// ---------------- config fwd_adst_dct_32 ----------------
+static const int8_t fwd_shift_adst_dct_32[3] = { 2, -4, 0 };
+static const int8_t fwd_stage_range_col_adst_dct_32[12] = {
+ 15, 15, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20
+};
+static const int8_t fwd_stage_range_row_adst_dct_32[10] = {
+ 16, 17, 18, 19, 20, 20, 20, 20, 20, 20
+};
+static const int8_t fwd_cos_bit_col_adst_dct_32[12] = {
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+};
+static const int8_t fwd_cos_bit_row_adst_dct_32[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_dct_32 = {
+ 32, // .txfm_size
+ 12, // .stage_num_col
+ 10, // .stage_num_row
+ // 1, // .log_scale
+ fwd_shift_adst_dct_32, // .shift
+ fwd_stage_range_col_adst_dct_32, // .stage_range_col
+ fwd_stage_range_row_adst_dct_32, // .stage_range_row
+ fwd_cos_bit_col_adst_dct_32, // .cos_bit_col
+ fwd_cos_bit_row_adst_dct_32, // .cos_bit_row
+  TXFM_TYPE_ADST32, // .txfm_type_col
+  TXFM_TYPE_DCT32 // .txfm_type_row
+};
+#endif // AV1_FWD_TXFM2D_CFG_H_
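The TXFM_2D_CFG records above bundle, for each transform pair, the 1-D kernel types for the column and row passes, the stage counts of each pass, a three-entry shift array, and per-stage stage_range/cos_bit arrays that bound coefficient magnitude and cosine-constant precision. The sketch below shows one plausible way such a record could drive a two-pass 2-D transform; it is illustrative only, and example_cfg, round_shift_array and example_fwd_txfm_2d are hypothetical names, not functions from this tree. The shift entries are assumed to apply before the column pass, between the passes, and on the final output.

/* Illustrative sketch (not part of this tree): how a TXFM_2D_CFG-style
 * record could drive the two passes of a 2-D transform.  The 1-D kernels
 * are supplied by the caller as function pointers. */
#include <stdint.h>

typedef void (*txfm_1d_fn)(const int32_t *in, int32_t *out);

typedef struct {
  int txfm_size;        /* transform dimension: 4, 8, 16 or 32 in the tables above */
  const int8_t *shift;  /* assumed: shift[0] before columns, shift[1] before rows,
                           shift[2] on the final output */
} example_cfg;

/* Round-shift a buffer; a negative shift is a right shift with rounding. */
static void round_shift_array(int32_t *arr, int n, int shift) {
  if (shift == 0) return;
  for (int i = 0; i < n; ++i) {
    if (shift > 0)
      arr[i] = arr[i] * (1 << shift);
    else
      arr[i] = (arr[i] + (1 << (-shift - 1))) >> -shift;
  }
}

/* input, buf and out are txfm_size * txfm_size, row-major. */
static void example_fwd_txfm_2d(const int16_t *input, int32_t *buf, int32_t *out,
                                const example_cfg *cfg, txfm_1d_fn col_txfm,
                                txfm_1d_fn row_txfm) {
  const int n = cfg->txfm_size;
  int32_t tmp_in[32], tmp_out[32]; /* n <= 32 for the configs above */
  /* Column pass: gather each column, pre-shift, run the column kernel. */
  for (int c = 0; c < n; ++c) {
    for (int r = 0; r < n; ++r) tmp_in[r] = input[r * n + c];
    round_shift_array(tmp_in, n, cfg->shift[0]);
    col_txfm(tmp_in, tmp_out);
    for (int r = 0; r < n; ++r) buf[r * n + c] = tmp_out[r];
  }
  /* Row pass: inter-pass shift, run the row kernel, apply the final shift. */
  for (int r = 0; r < n; ++r) {
    round_shift_array(buf + r * n, n, cfg->shift[1]);
    row_txfm(buf + r * n, out + r * n);
    round_shift_array(out + r * n, n, cfg->shift[2]);
  }
}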
diff --git a/av1/common/av1_inv_txfm.c b/av1/common/av1_inv_txfm.c
deleted file mode 100644
index bfe617b..0000000
--- a/av1/common/av1_inv_txfm.c
+++ /dev/null
@@ -1,2477 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <math.h>
-#include <string.h>
-
-#include "av1/common/av1_inv_txfm.h"
-
-void av1_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
- 0.5 shifts per pixel. */
- int i;
- tran_low_t output[16];
- tran_high_t a1, b1, c1, d1, e1;
- const tran_low_t *ip = input;
- tran_low_t *op = output;
-
- for (i = 0; i < 4; i++) {
- a1 = ip[0] >> UNIT_QUANT_SHIFT;
- c1 = ip[1] >> UNIT_QUANT_SHIFT;
- d1 = ip[2] >> UNIT_QUANT_SHIFT;
- b1 = ip[3] >> UNIT_QUANT_SHIFT;
- a1 += c1;
- d1 -= b1;
- e1 = (a1 - d1) >> 1;
- b1 = e1 - b1;
- c1 = e1 - c1;
- a1 -= b1;
- d1 += c1;
- op[0] = WRAPLOW(a1, 8);
- op[1] = WRAPLOW(b1, 8);
- op[2] = WRAPLOW(c1, 8);
- op[3] = WRAPLOW(d1, 8);
- ip += 4;
- op += 4;
- }
-
- ip = output;
- for (i = 0; i < 4; i++) {
- a1 = ip[4 * 0];
- c1 = ip[4 * 1];
- d1 = ip[4 * 2];
- b1 = ip[4 * 3];
- a1 += c1;
- d1 -= b1;
- e1 = (a1 - d1) >> 1;
- b1 = e1 - b1;
- c1 = e1 - c1;
- a1 -= b1;
- d1 += c1;
- dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
- dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
- dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
- dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);
-
- ip++;
- dest++;
- }
-}
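The 3.5 adds and 0.5 shifts per pixel quoted at the top of av1_iwht4x4_16_add_c come from its 1-D butterfly: each 4-sample pass costs 7 additions/subtractions and 1 shift, and the 4x4 block runs 8 such passes (4 row passes plus 4 column passes) over 16 pixels, giving 56/16 = 3.5 adds and 8/16 = 0.5 shifts. A standalone copy of that butterfly, for illustration only:

/* Illustrative standalone copy of the 1-D 4-point reversible WHT butterfly
 * used twice inside av1_iwht4x4_16_add_c (not library code). */
#include <stdint.h>

static void iwht4_1d_example(const int32_t in[4], int32_t out[4]) {
  int32_t a = in[0], c = in[1], d = in[2], b = in[3], e;
  a += c;            /* add 1 */
  d -= b;            /* add 2 */
  e = (a - d) >> 1;  /* add 3, shift 1 */
  b = e - b;         /* add 4 */
  c = e - c;         /* add 5 */
  a -= b;            /* add 6 */
  d += c;            /* add 7 */
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
}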
-
-void av1_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
- int i;
- tran_high_t a1, e1;
- tran_low_t tmp[4];
- const tran_low_t *ip = in;
- tran_low_t *op = tmp;
-
- a1 = ip[0] >> UNIT_QUANT_SHIFT;
- e1 = a1 >> 1;
- a1 -= e1;
- op[0] = WRAPLOW(a1, 8);
- op[1] = op[2] = op[3] = WRAPLOW(e1, 8);
-
- ip = tmp;
- for (i = 0; i < 4; i++) {
- e1 = ip[0] >> 1;
- a1 = ip[0] - e1;
- dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
- dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
- dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
- dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
- ip++;
- dest++;
- }
-}
-
-void av1_idct4_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step[4];
- tran_high_t temp1, temp2;
- // stage 1
- temp1 = (input[0] + input[2]) * cospi_16_64;
- temp2 = (input[0] - input[2]) * cospi_16_64;
- step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
- temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- // stage 2
- output[0] = WRAPLOW(step[0] + step[3], 8);
- output[1] = WRAPLOW(step[1] + step[2], 8);
- output[2] = WRAPLOW(step[1] - step[2], 8);
- output[3] = WRAPLOW(step[0] - step[3], 8);
-}
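av1_idct4_c is two stages: a pair of fixed-point rotations followed by a butterfly. As a hedged sketch (assuming the usual convention in this code base that cospi_k_64 is cos(k*pi/64) scaled by 2^14 and that dct_const_round_shift rounds away those 14 fractional bits), the same computation can be written with the constants spelled out; ex_cospi and ex_round_shift below are illustrative helpers, not functions from this tree.

#include <math.h>
#include <stdint.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

#define EX_CONST_BITS 14 /* assumed precision of the cospi_*_64 constants */

static int32_t ex_cospi(int k) { /* cos(k*pi/64) in Q14 */
  return (int32_t)lrint(cos(k * M_PI / 64.0) * (1 << EX_CONST_BITS));
}

static int32_t ex_round_shift(int64_t x) { /* drop 14 fractional bits, rounding */
  return (int32_t)((x + (1 << (EX_CONST_BITS - 1))) >> EX_CONST_BITS);
}

static void ex_idct4(const int32_t in[4], int32_t out[4]) {
  /* stage 1: two rotations, mirroring av1_idct4_c above */
  const int32_t s0 = ex_round_shift((int64_t)(in[0] + in[2]) * ex_cospi(16));
  const int32_t s1 = ex_round_shift((int64_t)(in[0] - in[2]) * ex_cospi(16));
  const int32_t s2 =
      ex_round_shift((int64_t)in[1] * ex_cospi(24) - (int64_t)in[3] * ex_cospi(8));
  const int32_t s3 =
      ex_round_shift((int64_t)in[1] * ex_cospi(8) + (int64_t)in[3] * ex_cospi(24));
  /* stage 2: butterfly */
  out[0] = s0 + s3;
  out[1] = s1 + s2;
  out[2] = s1 - s2;
  out[3] = s0 - s3;
}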
-
-void av1_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- tran_low_t out[4 * 4];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[4], temp_out[4];
-
- // Rows
- for (i = 0; i < 4; ++i) {
- av1_idct4_c(input, outptr);
- input += 4;
- outptr += 4;
- }
-
- // Columns
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
- av1_idct4_c(temp_in, temp_out);
- for (j = 0; j < 4; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 4));
- }
- }
-}
-
-void av1_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
- int dest_stride) {
- int i;
- tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
- a1 = ROUND_POWER_OF_TWO(out, 4);
-
- for (i = 0; i < 4; i++) {
- dest[0] = clip_pixel_add(dest[0], a1);
- dest[1] = clip_pixel_add(dest[1], a1);
- dest[2] = clip_pixel_add(dest[2], a1);
- dest[3] = clip_pixel_add(dest[3], a1);
- dest += dest_stride;
- }
-}
-
-void av1_idct8_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step1[8], step2[8];
- tran_high_t temp1, temp2;
- // stage 1
- step1[0] = input[0];
- step1[2] = input[4];
- step1[1] = input[2];
- step1[3] = input[6];
- temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
- temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
- temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- // stage 2
- temp1 = (step1[0] + step1[2]) * cospi_16_64;
- temp2 = (step1[0] - step1[2]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
- step2[4] = WRAPLOW(step1[4] + step1[5], 8);
- step2[5] = WRAPLOW(step1[4] - step1[5], 8);
- step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
- step2[7] = WRAPLOW(step1[6] + step1[7], 8);
-
- // stage 3
- step1[0] = WRAPLOW(step2[0] + step2[3], 8);
- step1[1] = WRAPLOW(step2[1] + step2[2], 8);
- step1[2] = WRAPLOW(step2[1] - step2[2], 8);
- step1[3] = WRAPLOW(step2[0] - step2[3], 8);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
- step1[7] = step2[7];
-
- // stage 4
- output[0] = WRAPLOW(step1[0] + step1[7], 8);
- output[1] = WRAPLOW(step1[1] + step1[6], 8);
- output[2] = WRAPLOW(step1[2] + step1[5], 8);
- output[3] = WRAPLOW(step1[3] + step1[4], 8);
- output[4] = WRAPLOW(step1[3] - step1[4], 8);
- output[5] = WRAPLOW(step1[2] - step1[5], 8);
- output[6] = WRAPLOW(step1[1] - step1[6], 8);
- output[7] = WRAPLOW(step1[0] - step1[7], 8);
-}
-
-void av1_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- tran_low_t out[8 * 8];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
-
- // First transform rows
- for (i = 0; i < 8; ++i) {
- av1_idct8_c(input, outptr);
- input += 8;
- outptr += 8;
- }
-
- // Then transform columns
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_idct8_c(temp_in, temp_out);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 5));
- }
- }
-}
-
-void av1_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- int i, j;
- tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
- a1 = ROUND_POWER_OF_TWO(out, 5);
- for (j = 0; j < 8; ++j) {
- for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
- dest += stride;
- }
-}
-
-#if CONFIG_CB4X4
-void av1_iadst2_c(const tran_low_t *input, tran_low_t *output) {
- tran_high_t s0, s1;
- const tran_low_t x0 = input[0];
- const tran_low_t x1 = input[1];
-
- s0 = sinpi_1_5 * x0 + sinpi_2_5 * x1;
- s1 = sinpi_2_5 * x0 - sinpi_1_5 * x1;
-
- output[0] = WRAPLOW(dct_const_round_shift(s0), 8);
- output[1] = WRAPLOW(dct_const_round_shift(s1), 8);
-}
-#endif
-
-void av1_iadst4_c(const tran_low_t *input, tran_low_t *output) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
- tran_low_t x0 = input[0];
- tran_low_t x1 = input[1];
- tran_low_t x2 = input[2];
- tran_low_t x3 = input[3];
-
- if (!(x0 | x1 | x2 | x3)) {
- output[0] = output[1] = output[2] = output[3] = 0;
- return;
- }
-
- s0 = sinpi_1_9 * x0;
- s1 = sinpi_2_9 * x0;
- s2 = sinpi_3_9 * x1;
- s3 = sinpi_4_9 * x2;
- s4 = sinpi_1_9 * x2;
- s5 = sinpi_2_9 * x3;
- s6 = sinpi_4_9 * x3;
- s7 = x0 - x2 + x3;
-
- s0 = s0 + s3 + s5;
- s1 = s1 - s4 - s6;
- s3 = s2;
- s2 = sinpi_3_9 * s7;
-
- // 1-D transform scaling factor is sqrt(2).
- // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
- // + 1b (addition) = 29b.
- // Hence the output bit depth is 15b.
- output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
- output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
- output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
- output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
-}
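The dynamic-range comment inside av1_iadst4_c is worth spelling out: a 14-bit input times a 14-bit sinpi constant needs 28 bits, one extra addition brings the intermediates to 29 bits, and the 14-bit rounding shift then leaves a 15-bit result, which is why the intermediates are held in tran_high_t. A small illustrative check of that arithmetic (not library code):

#include <assert.h>

static void iadst4_range_example(void) {
  const int input_bits = 14;  /* per the comment in av1_iadst4_c */
  const int const_bits = 14;  /* sinpi_*_9 multiplication scaling */
  const int add_bits = 1;     /* one extra bit from the additions */
  const int shift_bits = 14;  /* bits dropped by dct_const_round_shift */
  const int intermediate_bits = input_bits + const_bits + add_bits;
  const int output_bits = intermediate_bits - shift_bits;
  assert(intermediate_bits == 29); /* intermediates need a >=29-bit type */
  assert(output_bits == 15);       /* matches the comment's 15b output claim */
  (void)intermediate_bits;
  (void)output_bits;
}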
-
-void av1_iadst8_c(const tran_low_t *input, tran_low_t *output) {
- int s0, s1, s2, s3, s4, s5, s6, s7;
-
- tran_high_t x0 = input[7];
- tran_high_t x1 = input[0];
- tran_high_t x2 = input[5];
- tran_high_t x3 = input[2];
- tran_high_t x4 = input[3];
- tran_high_t x5 = input[4];
- tran_high_t x6 = input[1];
- tran_high_t x7 = input[6];
-
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
- output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
- output[6] = output[7] = 0;
- return;
- }
-
- // stage 1
- s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
- s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
- s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
- s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
- s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
- s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
- s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
- s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
-
- x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
- x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
- x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
- x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
- x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
- x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
- x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
- x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
-
- // stage 2
- s0 = (int)x0;
- s1 = (int)x1;
- s2 = (int)x2;
- s3 = (int)x3;
- s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
- s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
- s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
- s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
-
- x0 = WRAPLOW(s0 + s2, 8);
- x1 = WRAPLOW(s1 + s3, 8);
- x2 = WRAPLOW(s0 - s2, 8);
- x3 = WRAPLOW(s1 - s3, 8);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
-
- // stage 3
- s2 = (int)(cospi_16_64 * (x2 + x3));
- s3 = (int)(cospi_16_64 * (x2 - x3));
- s6 = (int)(cospi_16_64 * (x6 + x7));
- s7 = (int)(cospi_16_64 * (x6 - x7));
-
- x2 = WRAPLOW(dct_const_round_shift(s2), 8);
- x3 = WRAPLOW(dct_const_round_shift(s3), 8);
- x6 = WRAPLOW(dct_const_round_shift(s6), 8);
- x7 = WRAPLOW(dct_const_round_shift(s7), 8);
-
- output[0] = WRAPLOW(x0, 8);
- output[1] = WRAPLOW(-x4, 8);
- output[2] = WRAPLOW(x6, 8);
- output[3] = WRAPLOW(-x2, 8);
- output[4] = WRAPLOW(x3, 8);
- output[5] = WRAPLOW(-x7, 8);
- output[6] = WRAPLOW(x5, 8);
- output[7] = WRAPLOW(-x1, 8);
-}
-
-void av1_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- tran_low_t out[8 * 8] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
-
- // First transform rows
-  // only the first 4 rows have non-zero coefs
- for (i = 0; i < 4; ++i) {
- av1_idct8_c(input, outptr);
- input += 8;
- outptr += 8;
- }
-
- // Then transform columns
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_idct8_c(temp_in, temp_out);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 5));
- }
- }
-}
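av1_idct8x8_12_add_c (and, further down, the 16x16_10 and 32x32_34 variants) relies on the same shortcut: when only the first few rows of the coefficient block can be non-zero, the remaining rows are all zero, a linear transform maps them to zero, and so the intermediate buffer is zero-initialised and those rows are skipped before the full column pass. A generic sketch of that pattern (illustrative only, not library code):

#include <stdint.h>
#include <string.h>

/* Run a 1-D row pass only over the rows that can hold non-zero coefficients;
 * the rest of |out| stays zero, which is exactly what the full pass would
 * produce for all-zero rows. */
static void partial_row_pass_example(const int32_t *input, int32_t *out, int size,
                                     int nonzero_rows,
                                     void (*txfm_1d)(const int32_t *, int32_t *)) {
  memset(out, 0, sizeof(*out) * size * size);
  for (int r = 0; r < nonzero_rows; ++r)
    txfm_1d(input + r * size, out + r * size);
  /* the column pass then runs over all |size| columns as usual */
}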
-
-void av1_idct16_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step1[16], step2[16];
- tran_high_t temp1, temp2;
-
- // stage 1
- step1[0] = input[0 / 2];
- step1[1] = input[16 / 2];
- step1[2] = input[8 / 2];
- step1[3] = input[24 / 2];
- step1[4] = input[4 / 2];
- step1[5] = input[20 / 2];
- step1[6] = input[12 / 2];
- step1[7] = input[28 / 2];
- step1[8] = input[2 / 2];
- step1[9] = input[18 / 2];
- step1[10] = input[10 / 2];
- step1[11] = input[26 / 2];
- step1[12] = input[6 / 2];
- step1[13] = input[22 / 2];
- step1[14] = input[14 / 2];
- step1[15] = input[30 / 2];
-
- // stage 2
- step2[0] = step1[0];
- step2[1] = step1[1];
- step2[2] = step1[2];
- step2[3] = step1[3];
- step2[4] = step1[4];
- step2[5] = step1[5];
- step2[6] = step1[6];
- step2[7] = step1[7];
-
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- // stage 3
- step1[0] = step2[0];
- step1[1] = step2[1];
- step1[2] = step2[2];
- step1[3] = step2[3];
-
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- step1[8] = WRAPLOW(step2[8] + step2[9], 8);
- step1[9] = WRAPLOW(step2[8] - step2[9], 8);
- step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
- step1[11] = WRAPLOW(step2[10] + step2[11], 8);
- step1[12] = WRAPLOW(step2[12] + step2[13], 8);
- step1[13] = WRAPLOW(step2[12] - step2[13], 8);
- step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
- step1[15] = WRAPLOW(step2[14] + step2[15], 8);
-
- // stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
- step2[4] = WRAPLOW(step1[4] + step1[5], 8);
- step2[5] = WRAPLOW(step1[4] - step1[5], 8);
- step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
- step2[7] = WRAPLOW(step1[6] + step1[7], 8);
-
- step2[8] = step1[8];
- step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
- step2[11] = step1[11];
- step2[12] = step1[12];
-
- // stage 5
- step1[0] = WRAPLOW(step2[0] + step2[3], 8);
- step1[1] = WRAPLOW(step2[1] + step2[2], 8);
- step1[2] = WRAPLOW(step2[1] - step2[2], 8);
- step1[3] = WRAPLOW(step2[0] - step2[3], 8);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
- step1[7] = step2[7];
-
- step1[8] = WRAPLOW(step2[8] + step2[11], 8);
- step1[9] = WRAPLOW(step2[9] + step2[10], 8);
- step1[10] = WRAPLOW(step2[9] - step2[10], 8);
- step1[11] = WRAPLOW(step2[8] - step2[11], 8);
- step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
- step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
- step1[14] = WRAPLOW(step2[13] + step2[14], 8);
- step1[15] = WRAPLOW(step2[12] + step2[15], 8);
-
- // stage 6
- step2[0] = WRAPLOW(step1[0] + step1[7], 8);
- step2[1] = WRAPLOW(step1[1] + step1[6], 8);
- step2[2] = WRAPLOW(step1[2] + step1[5], 8);
- step2[3] = WRAPLOW(step1[3] + step1[4], 8);
- step2[4] = WRAPLOW(step1[3] - step1[4], 8);
- step2[5] = WRAPLOW(step1[2] - step1[5], 8);
- step2[6] = WRAPLOW(step1[1] - step1[6], 8);
- step2[7] = WRAPLOW(step1[0] - step1[7], 8);
- step2[8] = step1[8];
- step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
- step2[14] = step1[14];
- step2[15] = step1[15];
-
- // stage 7
- output[0] = WRAPLOW(step2[0] + step2[15], 8);
- output[1] = WRAPLOW(step2[1] + step2[14], 8);
- output[2] = WRAPLOW(step2[2] + step2[13], 8);
- output[3] = WRAPLOW(step2[3] + step2[12], 8);
- output[4] = WRAPLOW(step2[4] + step2[11], 8);
- output[5] = WRAPLOW(step2[5] + step2[10], 8);
- output[6] = WRAPLOW(step2[6] + step2[9], 8);
- output[7] = WRAPLOW(step2[7] + step2[8], 8);
- output[8] = WRAPLOW(step2[7] - step2[8], 8);
- output[9] = WRAPLOW(step2[6] - step2[9], 8);
- output[10] = WRAPLOW(step2[5] - step2[10], 8);
- output[11] = WRAPLOW(step2[4] - step2[11], 8);
- output[12] = WRAPLOW(step2[3] - step2[12], 8);
- output[13] = WRAPLOW(step2[2] - step2[13], 8);
- output[14] = WRAPLOW(step2[1] - step2[14], 8);
- output[15] = WRAPLOW(step2[0] - step2[15], 8);
-}
-
-void av1_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[16 * 16];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
-
- // First transform rows
- for (i = 0; i < 16; ++i) {
- av1_idct16_c(input, outptr);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_idct16_c(temp_in, temp_out);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void av1_iadst16_c(const tran_low_t *input, tran_low_t *output) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
- tran_high_t x0 = input[15];
- tran_high_t x1 = input[0];
- tran_high_t x2 = input[13];
- tran_high_t x3 = input[2];
- tran_high_t x4 = input[11];
- tran_high_t x5 = input[4];
- tran_high_t x6 = input[9];
- tran_high_t x7 = input[6];
- tran_high_t x8 = input[7];
- tran_high_t x9 = input[8];
- tran_high_t x10 = input[5];
- tran_high_t x11 = input[10];
- tran_high_t x12 = input[3];
- tran_high_t x13 = input[12];
- tran_high_t x14 = input[1];
- tran_high_t x15 = input[14];
-
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
- x13 | x14 | x15)) {
- output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
- output[6] = output[7] = output[8] = output[9] = output[10] =
- output[11] = output[12] = output[13] = output[14] = output[15] = 0;
- return;
- }
-
- // stage 1
- s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
- s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
- s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
- s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
- s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
- s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
- s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
- s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
- s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
- s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
- s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
- s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
- s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
- s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
- s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
- s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
- x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
- x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
- x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
- x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
- x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
- x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
- x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
- x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
- x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
- x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
- x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
- x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
- x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
- x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);
-
- // stage 2
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4;
- s5 = x5;
- s6 = x6;
- s7 = x7;
- s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
- s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
- s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
- s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
- s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
- s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
- s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
- s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
- x0 = WRAPLOW(s0 + s4, 8);
- x1 = WRAPLOW(s1 + s5, 8);
- x2 = WRAPLOW(s2 + s6, 8);
- x3 = WRAPLOW(s3 + s7, 8);
- x4 = WRAPLOW(s0 - s4, 8);
- x5 = WRAPLOW(s1 - s5, 8);
- x6 = WRAPLOW(s2 - s6, 8);
- x7 = WRAPLOW(s3 - s7, 8);
- x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
- x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
- x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
- x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
- x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
- x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
- x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
- x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);
-
- // stage 3
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
- s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
- s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
- s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
- s8 = x8;
- s9 = x9;
- s10 = x10;
- s11 = x11;
- s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
- s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
- s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
- s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
- x0 = WRAPLOW(check_range(s0 + s2), 8);
- x1 = WRAPLOW(check_range(s1 + s3), 8);
- x2 = WRAPLOW(check_range(s0 - s2), 8);
- x3 = WRAPLOW(check_range(s1 - s3), 8);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
- x8 = WRAPLOW(check_range(s8 + s10), 8);
- x9 = WRAPLOW(check_range(s9 + s11), 8);
- x10 = WRAPLOW(check_range(s8 - s10), 8);
- x11 = WRAPLOW(check_range(s9 - s11), 8);
- x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
- x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
- x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
- x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);
-
- // stage 4
- s2 = (-cospi_16_64) * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (-x6 + x7);
- s10 = cospi_16_64 * (x10 + x11);
- s11 = cospi_16_64 * (-x10 + x11);
- s14 = (-cospi_16_64) * (x14 + x15);
- s15 = cospi_16_64 * (x14 - x15);
-
- x2 = WRAPLOW(dct_const_round_shift(s2), 8);
- x3 = WRAPLOW(dct_const_round_shift(s3), 8);
- x6 = WRAPLOW(dct_const_round_shift(s6), 8);
- x7 = WRAPLOW(dct_const_round_shift(s7), 8);
- x10 = WRAPLOW(dct_const_round_shift(s10), 8);
- x11 = WRAPLOW(dct_const_round_shift(s11), 8);
- x14 = WRAPLOW(dct_const_round_shift(s14), 8);
- x15 = WRAPLOW(dct_const_round_shift(s15), 8);
-
- output[0] = WRAPLOW(x0, 8);
- output[1] = WRAPLOW(-x8, 8);
- output[2] = WRAPLOW(x12, 8);
- output[3] = WRAPLOW(-x4, 8);
- output[4] = WRAPLOW(x6, 8);
- output[5] = WRAPLOW(x14, 8);
- output[6] = WRAPLOW(x10, 8);
- output[7] = WRAPLOW(x2, 8);
- output[8] = WRAPLOW(x3, 8);
- output[9] = WRAPLOW(x11, 8);
- output[10] = WRAPLOW(x15, 8);
- output[11] = WRAPLOW(x7, 8);
- output[12] = WRAPLOW(x5, 8);
- output[13] = WRAPLOW(-x13, 8);
- output[14] = WRAPLOW(x9, 8);
- output[15] = WRAPLOW(-x1, 8);
-}
-
-void av1_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[16 * 16] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
-
-  // First transform rows. Since all non-zero dct coefficients are in the
-  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
- for (i = 0; i < 4; ++i) {
- av1_idct16_c(input, outptr);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_idct16_c(temp_in, temp_out);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void av1_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- int i, j;
- tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
- a1 = ROUND_POWER_OF_TWO(out, 6);
- for (j = 0; j < 16; ++j) {
- for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
- dest += stride;
- }
-}
-
-void av1_idct32_c(const tran_low_t *input, tran_low_t *output) {
- tran_low_t step1[32], step2[32];
- tran_high_t temp1, temp2;
-
- // stage 1
- step1[0] = input[0];
- step1[1] = input[16];
- step1[2] = input[8];
- step1[3] = input[24];
- step1[4] = input[4];
- step1[5] = input[20];
- step1[6] = input[12];
- step1[7] = input[28];
- step1[8] = input[2];
- step1[9] = input[18];
- step1[10] = input[10];
- step1[11] = input[26];
- step1[12] = input[6];
- step1[13] = input[22];
- step1[14] = input[14];
- step1[15] = input[30];
-
- temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
- temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
- step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
- temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
- temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
- temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
- temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
- temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
- temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
- temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- // stage 2
- step2[0] = step1[0];
- step2[1] = step1[1];
- step2[2] = step1[2];
- step2[3] = step1[3];
- step2[4] = step1[4];
- step2[5] = step1[5];
- step2[6] = step1[6];
- step2[7] = step1[7];
-
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- step2[16] = WRAPLOW(step1[16] + step1[17], 8);
- step2[17] = WRAPLOW(step1[16] - step1[17], 8);
- step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
- step2[19] = WRAPLOW(step1[18] + step1[19], 8);
- step2[20] = WRAPLOW(step1[20] + step1[21], 8);
- step2[21] = WRAPLOW(step1[20] - step1[21], 8);
- step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
- step2[23] = WRAPLOW(step1[22] + step1[23], 8);
- step2[24] = WRAPLOW(step1[24] + step1[25], 8);
- step2[25] = WRAPLOW(step1[24] - step1[25], 8);
- step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
- step2[27] = WRAPLOW(step1[26] + step1[27], 8);
- step2[28] = WRAPLOW(step1[28] + step1[29], 8);
- step2[29] = WRAPLOW(step1[28] - step1[29], 8);
- step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
- step2[31] = WRAPLOW(step1[30] + step1[31], 8);
-
- // stage 3
- step1[0] = step2[0];
- step1[1] = step2[1];
- step1[2] = step2[2];
- step1[3] = step2[3];
-
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
-
- step1[8] = WRAPLOW(step2[8] + step2[9], 8);
- step1[9] = WRAPLOW(step2[8] - step2[9], 8);
- step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
- step1[11] = WRAPLOW(step2[10] + step2[11], 8);
- step1[12] = WRAPLOW(step2[12] + step2[13], 8);
- step1[13] = WRAPLOW(step2[12] - step2[13], 8);
- step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
- step1[15] = WRAPLOW(step2[14] + step2[15], 8);
-
- step1[16] = step2[16];
- step1[31] = step2[31];
- temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
- temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
- temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
- step1[19] = step2[19];
- step1[20] = step2[20];
- temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
- temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
- temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
- step1[23] = step2[23];
- step1[24] = step2[24];
- step1[27] = step2[27];
- step1[28] = step2[28];
-
- // stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
- step2[4] = WRAPLOW(step1[4] + step1[5], 8);
- step2[5] = WRAPLOW(step1[4] - step1[5], 8);
- step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
- step2[7] = WRAPLOW(step1[6] + step1[7], 8);
-
- step2[8] = step1[8];
- step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
- step2[11] = step1[11];
- step2[12] = step1[12];
-
- step2[16] = WRAPLOW(step1[16] + step1[19], 8);
- step2[17] = WRAPLOW(step1[17] + step1[18], 8);
- step2[18] = WRAPLOW(step1[17] - step1[18], 8);
- step2[19] = WRAPLOW(step1[16] - step1[19], 8);
- step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
- step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
- step2[22] = WRAPLOW(step1[21] + step1[22], 8);
- step2[23] = WRAPLOW(step1[20] + step1[23], 8);
-
- step2[24] = WRAPLOW(step1[24] + step1[27], 8);
- step2[25] = WRAPLOW(step1[25] + step1[26], 8);
- step2[26] = WRAPLOW(step1[25] - step1[26], 8);
- step2[27] = WRAPLOW(step1[24] - step1[27], 8);
- step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
- step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
- step2[30] = WRAPLOW(step1[29] + step1[30], 8);
- step2[31] = WRAPLOW(step1[28] + step1[31], 8);
-
- // stage 5
- step1[0] = WRAPLOW(step2[0] + step2[3], 8);
- step1[1] = WRAPLOW(step2[1] + step2[2], 8);
- step1[2] = WRAPLOW(step2[1] - step2[2], 8);
- step1[3] = WRAPLOW(step2[0] - step2[3], 8);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
- step1[7] = step2[7];
-
- step1[8] = WRAPLOW(step2[8] + step2[11], 8);
- step1[9] = WRAPLOW(step2[9] + step2[10], 8);
- step1[10] = WRAPLOW(step2[9] - step2[10], 8);
- step1[11] = WRAPLOW(step2[8] - step2[11], 8);
- step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
- step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
- step1[14] = WRAPLOW(step2[13] + step2[14], 8);
- step1[15] = WRAPLOW(step2[12] + step2[15], 8);
-
- step1[16] = step2[16];
- step1[17] = step2[17];
- temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
- temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
- temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
- temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
- temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
- step1[22] = step2[22];
- step1[23] = step2[23];
- step1[24] = step2[24];
- step1[25] = step2[25];
- step1[30] = step2[30];
- step1[31] = step2[31];
-
- // stage 6
- step2[0] = WRAPLOW(step1[0] + step1[7], 8);
- step2[1] = WRAPLOW(step1[1] + step1[6], 8);
- step2[2] = WRAPLOW(step1[2] + step1[5], 8);
- step2[3] = WRAPLOW(step1[3] + step1[4], 8);
- step2[4] = WRAPLOW(step1[3] - step1[4], 8);
- step2[5] = WRAPLOW(step1[2] - step1[5], 8);
- step2[6] = WRAPLOW(step1[1] - step1[6], 8);
- step2[7] = WRAPLOW(step1[0] - step1[7], 8);
- step2[8] = step1[8];
- step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
- step2[14] = step1[14];
- step2[15] = step1[15];
-
- step2[16] = WRAPLOW(step1[16] + step1[23], 8);
- step2[17] = WRAPLOW(step1[17] + step1[22], 8);
- step2[18] = WRAPLOW(step1[18] + step1[21], 8);
- step2[19] = WRAPLOW(step1[19] + step1[20], 8);
- step2[20] = WRAPLOW(step1[19] - step1[20], 8);
- step2[21] = WRAPLOW(step1[18] - step1[21], 8);
- step2[22] = WRAPLOW(step1[17] - step1[22], 8);
- step2[23] = WRAPLOW(step1[16] - step1[23], 8);
-
- step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
- step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
- step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
- step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
- step2[28] = WRAPLOW(step1[27] + step1[28], 8);
- step2[29] = WRAPLOW(step1[26] + step1[29], 8);
- step2[30] = WRAPLOW(step1[25] + step1[30], 8);
- step2[31] = WRAPLOW(step1[24] + step1[31], 8);
-
- // stage 7
- step1[0] = WRAPLOW(step2[0] + step2[15], 8);
- step1[1] = WRAPLOW(step2[1] + step2[14], 8);
- step1[2] = WRAPLOW(step2[2] + step2[13], 8);
- step1[3] = WRAPLOW(step2[3] + step2[12], 8);
- step1[4] = WRAPLOW(step2[4] + step2[11], 8);
- step1[5] = WRAPLOW(step2[5] + step2[10], 8);
- step1[6] = WRAPLOW(step2[6] + step2[9], 8);
- step1[7] = WRAPLOW(step2[7] + step2[8], 8);
- step1[8] = WRAPLOW(step2[7] - step2[8], 8);
- step1[9] = WRAPLOW(step2[6] - step2[9], 8);
- step1[10] = WRAPLOW(step2[5] - step2[10], 8);
- step1[11] = WRAPLOW(step2[4] - step2[11], 8);
- step1[12] = WRAPLOW(step2[3] - step2[12], 8);
- step1[13] = WRAPLOW(step2[2] - step2[13], 8);
- step1[14] = WRAPLOW(step2[1] - step2[14], 8);
- step1[15] = WRAPLOW(step2[0] - step2[15], 8);
-
- step1[16] = step2[16];
- step1[17] = step2[17];
- step1[18] = step2[18];
- step1[19] = step2[19];
- temp1 = (-step2[20] + step2[27]) * cospi_16_64;
- temp2 = (step2[20] + step2[27]) * cospi_16_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = (-step2[21] + step2[26]) * cospi_16_64;
- temp2 = (step2[21] + step2[26]) * cospi_16_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = (-step2[22] + step2[25]) * cospi_16_64;
- temp2 = (step2[22] + step2[25]) * cospi_16_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
- temp1 = (-step2[23] + step2[24]) * cospi_16_64;
- temp2 = (step2[23] + step2[24]) * cospi_16_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
- step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
- step1[28] = step2[28];
- step1[29] = step2[29];
- step1[30] = step2[30];
- step1[31] = step2[31];
-
- // final stage
- output[0] = WRAPLOW(step1[0] + step1[31], 8);
- output[1] = WRAPLOW(step1[1] + step1[30], 8);
- output[2] = WRAPLOW(step1[2] + step1[29], 8);
- output[3] = WRAPLOW(step1[3] + step1[28], 8);
- output[4] = WRAPLOW(step1[4] + step1[27], 8);
- output[5] = WRAPLOW(step1[5] + step1[26], 8);
- output[6] = WRAPLOW(step1[6] + step1[25], 8);
- output[7] = WRAPLOW(step1[7] + step1[24], 8);
- output[8] = WRAPLOW(step1[8] + step1[23], 8);
- output[9] = WRAPLOW(step1[9] + step1[22], 8);
- output[10] = WRAPLOW(step1[10] + step1[21], 8);
- output[11] = WRAPLOW(step1[11] + step1[20], 8);
- output[12] = WRAPLOW(step1[12] + step1[19], 8);
- output[13] = WRAPLOW(step1[13] + step1[18], 8);
- output[14] = WRAPLOW(step1[14] + step1[17], 8);
- output[15] = WRAPLOW(step1[15] + step1[16], 8);
- output[16] = WRAPLOW(step1[15] - step1[16], 8);
- output[17] = WRAPLOW(step1[14] - step1[17], 8);
- output[18] = WRAPLOW(step1[13] - step1[18], 8);
- output[19] = WRAPLOW(step1[12] - step1[19], 8);
- output[20] = WRAPLOW(step1[11] - step1[20], 8);
- output[21] = WRAPLOW(step1[10] - step1[21], 8);
- output[22] = WRAPLOW(step1[9] - step1[22], 8);
- output[23] = WRAPLOW(step1[8] - step1[23], 8);
- output[24] = WRAPLOW(step1[7] - step1[24], 8);
- output[25] = WRAPLOW(step1[6] - step1[25], 8);
- output[26] = WRAPLOW(step1[5] - step1[26], 8);
- output[27] = WRAPLOW(step1[4] - step1[27], 8);
- output[28] = WRAPLOW(step1[3] - step1[28], 8);
- output[29] = WRAPLOW(step1[2] - step1[29], 8);
- output[30] = WRAPLOW(step1[1] - step1[30], 8);
- output[31] = WRAPLOW(step1[0] - step1[31], 8);
-}
-
-void av1_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[32 * 32];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
-
- // Rows
- for (i = 0; i < 32; ++i) {
- int16_t zero_coeff[16];
- for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
- for (j = 0; j < 8; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 4; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 2; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
- if (zero_coeff[0] | zero_coeff[1])
- av1_idct32_c(input, outptr);
- else
- memset(outptr, 0, sizeof(tran_low_t) * 32);
- input += 32;
- outptr += 32;
- }
-
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- av1_idct32_c(temp_in, temp_out);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
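The row loop in av1_idct32x32_1024_add_c collapses each 32-coefficient row to a single flag with a log2-depth OR reduction so that all-zero rows can be skipped; a plainer linear equivalent of that check (illustrative only):

#include <stdint.h>

/* A row can be skipped when every one of its 32 coefficients is zero. */
static int row_is_all_zero_example(const int32_t *row) {
  int32_t acc = 0;
  for (int j = 0; j < 32; ++j) acc |= row[j];
  return acc == 0;
}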
-
-void av1_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[32 * 32] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
-
- // Rows
-  // only the upper-left 8x8 area has non-zero coeffs
- for (i = 0; i < 8; ++i) {
- av1_idct32_c(input, outptr);
- input += 32;
- outptr += 32;
- }
-
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- av1_idct32_c(temp_in, temp_out);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void av1_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- int i, j;
- tran_high_t a1;
-
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
- a1 = ROUND_POWER_OF_TWO(out, 6);
-
- for (j = 0; j < 32; ++j) {
- for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
- dest += stride;
- }
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
- 0.5 shifts per pixel. */
- int i;
- tran_low_t output[16];
- tran_high_t a1, b1, c1, d1, e1;
- const tran_low_t *ip = input;
- tran_low_t *op = output;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- for (i = 0; i < 4; i++) {
- a1 = ip[0] >> UNIT_QUANT_SHIFT;
- c1 = ip[1] >> UNIT_QUANT_SHIFT;
- d1 = ip[2] >> UNIT_QUANT_SHIFT;
- b1 = ip[3] >> UNIT_QUANT_SHIFT;
- a1 += c1;
- d1 -= b1;
- e1 = (a1 - d1) >> 1;
- b1 = e1 - b1;
- c1 = e1 - c1;
- a1 -= b1;
- d1 += c1;
- op[0] = WRAPLOW(a1, bd);
- op[1] = WRAPLOW(b1, bd);
- op[2] = WRAPLOW(c1, bd);
- op[3] = WRAPLOW(d1, bd);
- ip += 4;
- op += 4;
- }
-
- ip = output;
- for (i = 0; i < 4; i++) {
- a1 = ip[4 * 0];
- c1 = ip[4 * 1];
- d1 = ip[4 * 2];
- b1 = ip[4 * 3];
- a1 += c1;
- d1 -= b1;
- e1 = (a1 - d1) >> 1;
- b1 = e1 - b1;
- c1 = e1 - c1;
- a1 -= b1;
- d1 += c1;
- dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
- dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
- dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
- dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);
-
- ip++;
- dest++;
- }
-}
-
-void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
- int dest_stride, int bd) {
- int i;
- tran_high_t a1, e1;
- tran_low_t tmp[4];
- const tran_low_t *ip = in;
- tran_low_t *op = tmp;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- (void)bd;
-
- a1 = ip[0] >> UNIT_QUANT_SHIFT;
- e1 = a1 >> 1;
- a1 -= e1;
- op[0] = WRAPLOW(a1, bd);
- op[1] = op[2] = op[3] = WRAPLOW(e1, bd);
-
- ip = tmp;
- for (i = 0; i < 4; i++) {
- e1 = ip[0] >> 1;
- a1 = ip[0] - e1;
- dest[dest_stride * 0] =
- highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
- dest[dest_stride * 1] =
- highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
- dest[dest_stride * 2] =
- highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
- dest[dest_stride * 3] =
- highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
- ip++;
- dest++;
- }
-}
-
-void av1_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_low_t step[4];
- tran_high_t temp1, temp2;
- (void)bd;
- // stage 1
- temp1 = (input[0] + input[2]) * cospi_16_64;
- temp2 = (input[0] - input[2]) * cospi_16_64;
- step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
- temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- // stage 2
- output[0] = WRAPLOW(step[0] + step[3], bd);
- output[1] = WRAPLOW(step[1] + step[2], bd);
- output[2] = WRAPLOW(step[1] - step[2], bd);
- output[3] = WRAPLOW(step[0] - step[3], bd);
-}
-
-void av1_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[4 * 4];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[4], temp_out[4];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // Rows
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct4_c(input, outptr, bd);
- input += 4;
- outptr += 4;
- }
-
- // Columns
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
- av1_highbd_idct4_c(temp_in, temp_out, bd);
- for (j = 0; j < 4; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
- }
- }
-}
-
-void av1_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int dest_stride, int bd) {
- int i;
- tran_high_t a1;
- tran_low_t out =
- WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
- a1 = ROUND_POWER_OF_TWO(out, 4);
-
- for (i = 0; i < 4; i++) {
- dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
- dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
- dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
- dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
- dest += dest_stride;
- }
-}
-
-void av1_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_low_t step1[8], step2[8];
- tran_high_t temp1, temp2;
- // stage 1
- step1[0] = input[0];
- step1[2] = input[4];
- step1[1] = input[2];
- step1[3] = input[6];
- temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
- temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
- temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- // stage 2 & stage 3 - even half
- av1_highbd_idct4_c(step1, step1, bd);
-
- // stage 2 - odd half
- step2[4] = WRAPLOW(step1[4] + step1[5], bd);
- step2[5] = WRAPLOW(step1[4] - step1[5], bd);
- step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
- step2[7] = WRAPLOW(step1[6] + step1[7], bd);
-
- // stage 3 - odd half
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- step1[7] = step2[7];
-
- // stage 4
- output[0] = WRAPLOW(step1[0] + step1[7], bd);
- output[1] = WRAPLOW(step1[1] + step1[6], bd);
- output[2] = WRAPLOW(step1[2] + step1[5], bd);
- output[3] = WRAPLOW(step1[3] + step1[4], bd);
- output[4] = WRAPLOW(step1[3] - step1[4], bd);
- output[5] = WRAPLOW(step1[2] - step1[5], bd);
- output[6] = WRAPLOW(step1[1] - step1[6], bd);
- output[7] = WRAPLOW(step1[0] - step1[7], bd);
-}
-
-void av1_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // First transform rows.
- for (i = 0; i < 8; ++i) {
- av1_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
-
- // Then transform columns.
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
-}
-
-void av1_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- int i, j;
- tran_high_t a1;
- tran_low_t out =
- WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
- a1 = ROUND_POWER_OF_TWO(out, 5);
- for (j = 0; j < 8; ++j) {
- for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
- dest += stride;
- }
-}
-
-void av1_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
- tran_low_t x0 = input[0];
- tran_low_t x1 = input[1];
- tran_low_t x2 = input[2];
- tran_low_t x3 = input[3];
- (void)bd;
-
- if (!(x0 | x1 | x2 | x3)) {
- memset(output, 0, 4 * sizeof(*output));
- return;
- }
-
- s0 = sinpi_1_9 * x0;
- s1 = sinpi_2_9 * x0;
- s2 = sinpi_3_9 * x1;
- s3 = sinpi_4_9 * x2;
- s4 = sinpi_1_9 * x2;
- s5 = sinpi_2_9 * x3;
- s6 = sinpi_4_9 * x3;
- s7 = (tran_high_t)(x0 - x2 + x3);
-
- s0 = s0 + s3 + s5;
- s1 = s1 - s4 - s6;
- s3 = s2;
- s2 = sinpi_3_9 * s7;
-
- // 1-D transform scaling factor is sqrt(2).
- // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
- // + 1b (addition) = 29b.
- // Hence the output bit depth is 15b.
- output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
- output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
- output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
- output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
-}
-
-void av1_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-
- tran_low_t x0 = input[7];
- tran_low_t x1 = input[0];
- tran_low_t x2 = input[5];
- tran_low_t x3 = input[2];
- tran_low_t x4 = input[3];
- tran_low_t x5 = input[4];
- tran_low_t x6 = input[1];
- tran_low_t x7 = input[6];
- (void)bd;
-
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
- memset(output, 0, 8 * sizeof(*output));
- return;
- }
-
- // stage 1
- s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
- s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
- s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
- s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
- s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
- s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
-
- x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
- x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
- x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
- x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
- x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
- x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
- x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
- x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
-
- // stage 2
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
- s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
- s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
- s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
-
- x0 = WRAPLOW(s0 + s2, bd);
- x1 = WRAPLOW(s1 + s3, bd);
- x2 = WRAPLOW(s0 - s2, bd);
- x3 = WRAPLOW(s1 - s3, bd);
- x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
- x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
- x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
- x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
-
- // stage 3
- s2 = cospi_16_64 * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (x6 - x7);
-
- x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
- x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
- x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
- x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
-
- output[0] = WRAPLOW(x0, bd);
- output[1] = WRAPLOW(-x4, bd);
- output[2] = WRAPLOW(x6, bd);
- output[3] = WRAPLOW(-x2, bd);
- output[4] = WRAPLOW(x3, bd);
- output[5] = WRAPLOW(-x7, bd);
- output[6] = WRAPLOW(x5, bd);
- output[7] = WRAPLOW(-x1, bd);
-}
-
-void av1_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // First transform rows.
- // Only the first 4 rows have non-zero coefs.
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
- // Then transform columns.
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
-}
-
-void av1_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_low_t step1[16], step2[16];
- tran_high_t temp1, temp2;
- (void)bd;
-
- // stage 1
- step1[0] = input[0 / 2];
- step1[1] = input[16 / 2];
- step1[2] = input[8 / 2];
- step1[3] = input[24 / 2];
- step1[4] = input[4 / 2];
- step1[5] = input[20 / 2];
- step1[6] = input[12 / 2];
- step1[7] = input[28 / 2];
- step1[8] = input[2 / 2];
- step1[9] = input[18 / 2];
- step1[10] = input[10 / 2];
- step1[11] = input[26 / 2];
- step1[12] = input[6 / 2];
- step1[13] = input[22 / 2];
- step1[14] = input[14 / 2];
- step1[15] = input[30 / 2];
-
- // stage 2
- step2[0] = step1[0];
- step2[1] = step1[1];
- step2[2] = step1[2];
- step2[3] = step1[3];
- step2[4] = step1[4];
- step2[5] = step1[5];
- step2[6] = step1[6];
- step2[7] = step1[7];
-
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- // stage 3
- step1[0] = step2[0];
- step1[1] = step2[1];
- step1[2] = step2[2];
- step1[3] = step2[3];
-
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- step1[8] = WRAPLOW(step2[8] + step2[9], bd);
- step1[9] = WRAPLOW(step2[8] - step2[9], bd);
- step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
- step1[11] = WRAPLOW(step2[10] + step2[11], bd);
- step1[12] = WRAPLOW(step2[12] + step2[13], bd);
- step1[13] = WRAPLOW(step2[12] - step2[13], bd);
- step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
- step1[15] = WRAPLOW(step2[14] + step2[15], bd);
-
- // stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- step2[4] = WRAPLOW(step1[4] + step1[5], bd);
- step2[5] = WRAPLOW(step1[4] - step1[5], bd);
- step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
- step2[7] = WRAPLOW(step1[6] + step1[7], bd);
-
- step2[8] = step1[8];
- step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- step2[11] = step1[11];
- step2[12] = step1[12];
-
- // stage 5
- step1[0] = WRAPLOW(step2[0] + step2[3], bd);
- step1[1] = WRAPLOW(step2[1] + step2[2], bd);
- step1[2] = WRAPLOW(step2[1] - step2[2], bd);
- step1[3] = WRAPLOW(step2[0] - step2[3], bd);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- step1[7] = step2[7];
-
- step1[8] = WRAPLOW(step2[8] + step2[11], bd);
- step1[9] = WRAPLOW(step2[9] + step2[10], bd);
- step1[10] = WRAPLOW(step2[9] - step2[10], bd);
- step1[11] = WRAPLOW(step2[8] - step2[11], bd);
- step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
- step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
- step1[14] = WRAPLOW(step2[13] + step2[14], bd);
- step1[15] = WRAPLOW(step2[12] + step2[15], bd);
-
- // stage 6
- step2[0] = WRAPLOW(step1[0] + step1[7], bd);
- step2[1] = WRAPLOW(step1[1] + step1[6], bd);
- step2[2] = WRAPLOW(step1[2] + step1[5], bd);
- step2[3] = WRAPLOW(step1[3] + step1[4], bd);
- step2[4] = WRAPLOW(step1[3] - step1[4], bd);
- step2[5] = WRAPLOW(step1[2] - step1[5], bd);
- step2[6] = WRAPLOW(step1[1] - step1[6], bd);
- step2[7] = WRAPLOW(step1[0] - step1[7], bd);
- step2[8] = step1[8];
- step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- step2[14] = step1[14];
- step2[15] = step1[15];
-
- // stage 7
- output[0] = WRAPLOW(step2[0] + step2[15], bd);
- output[1] = WRAPLOW(step2[1] + step2[14], bd);
- output[2] = WRAPLOW(step2[2] + step2[13], bd);
- output[3] = WRAPLOW(step2[3] + step2[12], bd);
- output[4] = WRAPLOW(step2[4] + step2[11], bd);
- output[5] = WRAPLOW(step2[5] + step2[10], bd);
- output[6] = WRAPLOW(step2[6] + step2[9], bd);
- output[7] = WRAPLOW(step2[7] + step2[8], bd);
- output[8] = WRAPLOW(step2[7] - step2[8], bd);
- output[9] = WRAPLOW(step2[6] - step2[9], bd);
- output[10] = WRAPLOW(step2[5] - step2[10], bd);
- output[11] = WRAPLOW(step2[4] - step2[11], bd);
- output[12] = WRAPLOW(step2[3] - step2[12], bd);
- output[13] = WRAPLOW(step2[2] - step2[13], bd);
- output[14] = WRAPLOW(step2[1] - step2[14], bd);
- output[15] = WRAPLOW(step2[0] - step2[15], bd);
-}
-
-void av1_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // First transform rows.
- for (i = 0; i < 16; ++i) {
- av1_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns.
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
-}
-
-void av1_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- tran_high_t s9, s10, s11, s12, s13, s14, s15;
-
- tran_low_t x0 = input[15];
- tran_low_t x1 = input[0];
- tran_low_t x2 = input[13];
- tran_low_t x3 = input[2];
- tran_low_t x4 = input[11];
- tran_low_t x5 = input[4];
- tran_low_t x6 = input[9];
- tran_low_t x7 = input[6];
- tran_low_t x8 = input[7];
- tran_low_t x9 = input[8];
- tran_low_t x10 = input[5];
- tran_low_t x11 = input[10];
- tran_low_t x12 = input[3];
- tran_low_t x13 = input[12];
- tran_low_t x14 = input[1];
- tran_low_t x15 = input[14];
- (void)bd;
-
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
- x13 | x14 | x15)) {
- memset(output, 0, 16 * sizeof(*output));
- return;
- }
-
- // stage 1
- s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
- s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
- s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
- s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
- s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
- s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
- s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
- s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
- s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
- s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
- s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
- s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
- s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
- s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
- s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
- s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
- x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
- x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
- x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
- x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
- x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
- x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
- x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
- x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
- x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
- x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
- x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
- x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
- x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
- x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
- x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
- x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
-
- // stage 2
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4;
- s5 = x5;
- s6 = x6;
- s7 = x7;
- s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
- s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
- s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
- s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
- s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
- s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
- s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
- s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
- x0 = WRAPLOW(s0 + s4, bd);
- x1 = WRAPLOW(s1 + s5, bd);
- x2 = WRAPLOW(s2 + s6, bd);
- x3 = WRAPLOW(s3 + s7, bd);
- x4 = WRAPLOW(s0 - s4, bd);
- x5 = WRAPLOW(s1 - s5, bd);
- x6 = WRAPLOW(s2 - s6, bd);
- x7 = WRAPLOW(s3 - s7, bd);
- x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
- x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
- x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
- x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
- x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
- x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
- x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
- x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
-
- // stage 3
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
- s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
- s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
- s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
- s8 = x8;
- s9 = x9;
- s10 = x10;
- s11 = x11;
- s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
- s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
- s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
- s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
- x0 = WRAPLOW(s0 + s2, bd);
- x1 = WRAPLOW(s1 + s3, bd);
- x2 = WRAPLOW(s0 - s2, bd);
- x3 = WRAPLOW(s1 - s3, bd);
- x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
- x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
- x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
- x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
- x8 = WRAPLOW(s8 + s10, bd);
- x9 = WRAPLOW(s9 + s11, bd);
- x10 = WRAPLOW(s8 - s10, bd);
- x11 = WRAPLOW(s9 - s11, bd);
- x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
- x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
- x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
- x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
-
- // stage 4
- s2 = (-cospi_16_64) * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (-x6 + x7);
- s10 = cospi_16_64 * (x10 + x11);
- s11 = cospi_16_64 * (-x10 + x11);
- s14 = (-cospi_16_64) * (x14 + x15);
- s15 = cospi_16_64 * (x14 - x15);
-
- x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
- x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
- x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
- x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
- x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
- x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
- x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
- x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
-
- output[0] = WRAPLOW(x0, bd);
- output[1] = WRAPLOW(-x8, bd);
- output[2] = WRAPLOW(x12, bd);
- output[3] = WRAPLOW(-x4, bd);
- output[4] = WRAPLOW(x6, bd);
- output[5] = WRAPLOW(x14, bd);
- output[6] = WRAPLOW(x10, bd);
- output[7] = WRAPLOW(x2, bd);
- output[8] = WRAPLOW(x3, bd);
- output[9] = WRAPLOW(x11, bd);
- output[10] = WRAPLOW(x15, bd);
- output[11] = WRAPLOW(x7, bd);
- output[12] = WRAPLOW(x5, bd);
- output[13] = WRAPLOW(-x13, bd);
- output[14] = WRAPLOW(x9, bd);
- output[15] = WRAPLOW(-x1, bd);
-}
-
-void av1_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // First transform rows. Since all non-zero dct coefficients are in the
- // upper-left 4x4 area, we only need to calculate the first 4 rows here.
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns.
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
-}
-
-void av1_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- int i, j;
- tran_high_t a1;
- tran_low_t out =
- WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
- a1 = ROUND_POWER_OF_TWO(out, 6);
- for (j = 0; j < 16; ++j) {
- for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
- dest += stride;
- }
-}
-
-static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
- int bd) {
- tran_low_t step1[32], step2[32];
- tran_high_t temp1, temp2;
- (void)bd;
-
- // stage 1
- step1[0] = input[0];
- step1[1] = input[16];
- step1[2] = input[8];
- step1[3] = input[24];
- step1[4] = input[4];
- step1[5] = input[20];
- step1[6] = input[12];
- step1[7] = input[28];
- step1[8] = input[2];
- step1[9] = input[18];
- step1[10] = input[10];
- step1[11] = input[26];
- step1[12] = input[6];
- step1[13] = input[22];
- step1[14] = input[14];
- step1[15] = input[30];
-
- temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
- temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
- step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
- temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
- step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
- temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
- step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
- temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
- step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
- temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
- step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
- temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
- step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
- temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
- step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
- temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
- step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- // stage 2
- step2[0] = step1[0];
- step2[1] = step1[1];
- step2[2] = step1[2];
- step2[3] = step1[3];
- step2[4] = step1[4];
- step2[5] = step1[5];
- step2[6] = step1[6];
- step2[7] = step1[7];
-
- temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
- temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
- temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
- temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
- temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- step2[16] = WRAPLOW(step1[16] + step1[17], bd);
- step2[17] = WRAPLOW(step1[16] - step1[17], bd);
- step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
- step2[19] = WRAPLOW(step1[18] + step1[19], bd);
- step2[20] = WRAPLOW(step1[20] + step1[21], bd);
- step2[21] = WRAPLOW(step1[20] - step1[21], bd);
- step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
- step2[23] = WRAPLOW(step1[22] + step1[23], bd);
- step2[24] = WRAPLOW(step1[24] + step1[25], bd);
- step2[25] = WRAPLOW(step1[24] - step1[25], bd);
- step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
- step2[27] = WRAPLOW(step1[26] + step1[27], bd);
- step2[28] = WRAPLOW(step1[28] + step1[29], bd);
- step2[29] = WRAPLOW(step1[28] - step1[29], bd);
- step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
- step2[31] = WRAPLOW(step1[30] + step1[31], bd);
-
- // stage 3
- step1[0] = step2[0];
- step1[1] = step2[1];
- step1[2] = step2[2];
- step1[3] = step2[3];
-
- temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
- temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
- temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
-
- step1[8] = WRAPLOW(step2[8] + step2[9], bd);
- step1[9] = WRAPLOW(step2[8] - step2[9], bd);
- step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
- step1[11] = WRAPLOW(step2[10] + step2[11], bd);
- step1[12] = WRAPLOW(step2[12] + step2[13], bd);
- step1[13] = WRAPLOW(step2[12] - step2[13], bd);
- step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
- step1[15] = WRAPLOW(step2[14] + step2[15], bd);
-
- step1[16] = step2[16];
- step1[31] = step2[31];
- temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
- temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
- step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
- temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
- step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- step1[19] = step2[19];
- step1[20] = step2[20];
- temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
- temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
- step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
- temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
- step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- step1[23] = step2[23];
- step1[24] = step2[24];
- step1[27] = step2[27];
- step1[28] = step2[28];
-
- // stage 4
- temp1 = (step1[0] + step1[1]) * cospi_16_64;
- temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- step2[4] = WRAPLOW(step1[4] + step1[5], bd);
- step2[5] = WRAPLOW(step1[4] - step1[5], bd);
- step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
- step2[7] = WRAPLOW(step1[6] + step1[7], bd);
-
- step2[8] = step1[8];
- step2[15] = step1[15];
- temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
- temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
- temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- step2[11] = step1[11];
- step2[12] = step1[12];
-
- step2[16] = WRAPLOW(step1[16] + step1[19], bd);
- step2[17] = WRAPLOW(step1[17] + step1[18], bd);
- step2[18] = WRAPLOW(step1[17] - step1[18], bd);
- step2[19] = WRAPLOW(step1[16] - step1[19], bd);
- step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
- step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
- step2[22] = WRAPLOW(step1[21] + step1[22], bd);
- step2[23] = WRAPLOW(step1[20] + step1[23], bd);
-
- step2[24] = WRAPLOW(step1[24] + step1[27], bd);
- step2[25] = WRAPLOW(step1[25] + step1[26], bd);
- step2[26] = WRAPLOW(step1[25] - step1[26], bd);
- step2[27] = WRAPLOW(step1[24] - step1[27], bd);
- step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
- step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
- step2[30] = WRAPLOW(step1[29] + step1[30], bd);
- step2[31] = WRAPLOW(step1[28] + step1[31], bd);
-
- // stage 5
- step1[0] = WRAPLOW(step2[0] + step2[3], bd);
- step1[1] = WRAPLOW(step2[1] + step2[2], bd);
- step1[2] = WRAPLOW(step2[1] - step2[2], bd);
- step1[3] = WRAPLOW(step2[0] - step2[3], bd);
- step1[4] = step2[4];
- temp1 = (step2[6] - step2[5]) * cospi_16_64;
- temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- step1[7] = step2[7];
-
- step1[8] = WRAPLOW(step2[8] + step2[11], bd);
- step1[9] = WRAPLOW(step2[9] + step2[10], bd);
- step1[10] = WRAPLOW(step2[9] - step2[10], bd);
- step1[11] = WRAPLOW(step2[8] - step2[11], bd);
- step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
- step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
- step1[14] = WRAPLOW(step2[13] + step2[14], bd);
- step1[15] = WRAPLOW(step2[12] + step2[15], bd);
-
- step1[16] = step2[16];
- step1[17] = step2[17];
- temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
- temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
- step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
- temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
- step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
- temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
- step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
- temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
- step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- step1[22] = step2[22];
- step1[23] = step2[23];
- step1[24] = step2[24];
- step1[25] = step2[25];
- step1[30] = step2[30];
- step1[31] = step2[31];
-
- // stage 6
- step2[0] = WRAPLOW(step1[0] + step1[7], bd);
- step2[1] = WRAPLOW(step1[1] + step1[6], bd);
- step2[2] = WRAPLOW(step1[2] + step1[5], bd);
- step2[3] = WRAPLOW(step1[3] + step1[4], bd);
- step2[4] = WRAPLOW(step1[3] - step1[4], bd);
- step2[5] = WRAPLOW(step1[2] - step1[5], bd);
- step2[6] = WRAPLOW(step1[1] - step1[6], bd);
- step2[7] = WRAPLOW(step1[0] - step1[7], bd);
- step2[8] = step1[8];
- step2[9] = step1[9];
- temp1 = (-step1[10] + step1[13]) * cospi_16_64;
- temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = (-step1[11] + step1[12]) * cospi_16_64;
- temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- step2[14] = step1[14];
- step2[15] = step1[15];
-
- step2[16] = WRAPLOW(step1[16] + step1[23], bd);
- step2[17] = WRAPLOW(step1[17] + step1[22], bd);
- step2[18] = WRAPLOW(step1[18] + step1[21], bd);
- step2[19] = WRAPLOW(step1[19] + step1[20], bd);
- step2[20] = WRAPLOW(step1[19] - step1[20], bd);
- step2[21] = WRAPLOW(step1[18] - step1[21], bd);
- step2[22] = WRAPLOW(step1[17] - step1[22], bd);
- step2[23] = WRAPLOW(step1[16] - step1[23], bd);
-
- step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
- step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
- step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
- step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
- step2[28] = WRAPLOW(step1[27] + step1[28], bd);
- step2[29] = WRAPLOW(step1[26] + step1[29], bd);
- step2[30] = WRAPLOW(step1[25] + step1[30], bd);
- step2[31] = WRAPLOW(step1[24] + step1[31], bd);
-
- // stage 7
- step1[0] = WRAPLOW(step2[0] + step2[15], bd);
- step1[1] = WRAPLOW(step2[1] + step2[14], bd);
- step1[2] = WRAPLOW(step2[2] + step2[13], bd);
- step1[3] = WRAPLOW(step2[3] + step2[12], bd);
- step1[4] = WRAPLOW(step2[4] + step2[11], bd);
- step1[5] = WRAPLOW(step2[5] + step2[10], bd);
- step1[6] = WRAPLOW(step2[6] + step2[9], bd);
- step1[7] = WRAPLOW(step2[7] + step2[8], bd);
- step1[8] = WRAPLOW(step2[7] - step2[8], bd);
- step1[9] = WRAPLOW(step2[6] - step2[9], bd);
- step1[10] = WRAPLOW(step2[5] - step2[10], bd);
- step1[11] = WRAPLOW(step2[4] - step2[11], bd);
- step1[12] = WRAPLOW(step2[3] - step2[12], bd);
- step1[13] = WRAPLOW(step2[2] - step2[13], bd);
- step1[14] = WRAPLOW(step2[1] - step2[14], bd);
- step1[15] = WRAPLOW(step2[0] - step2[15], bd);
-
- step1[16] = step2[16];
- step1[17] = step2[17];
- step1[18] = step2[18];
- step1[19] = step2[19];
- temp1 = (-step2[20] + step2[27]) * cospi_16_64;
- temp2 = (step2[20] + step2[27]) * cospi_16_64;
- step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = (-step2[21] + step2[26]) * cospi_16_64;
- temp2 = (step2[21] + step2[26]) * cospi_16_64;
- step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = (-step2[22] + step2[25]) * cospi_16_64;
- temp2 = (step2[22] + step2[25]) * cospi_16_64;
- step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- temp1 = (-step2[23] + step2[24]) * cospi_16_64;
- temp2 = (step2[23] + step2[24]) * cospi_16_64;
- step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
- step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
- step1[28] = step2[28];
- step1[29] = step2[29];
- step1[30] = step2[30];
- step1[31] = step2[31];
-
- // final stage
- output[0] = WRAPLOW(step1[0] + step1[31], bd);
- output[1] = WRAPLOW(step1[1] + step1[30], bd);
- output[2] = WRAPLOW(step1[2] + step1[29], bd);
- output[3] = WRAPLOW(step1[3] + step1[28], bd);
- output[4] = WRAPLOW(step1[4] + step1[27], bd);
- output[5] = WRAPLOW(step1[5] + step1[26], bd);
- output[6] = WRAPLOW(step1[6] + step1[25], bd);
- output[7] = WRAPLOW(step1[7] + step1[24], bd);
- output[8] = WRAPLOW(step1[8] + step1[23], bd);
- output[9] = WRAPLOW(step1[9] + step1[22], bd);
- output[10] = WRAPLOW(step1[10] + step1[21], bd);
- output[11] = WRAPLOW(step1[11] + step1[20], bd);
- output[12] = WRAPLOW(step1[12] + step1[19], bd);
- output[13] = WRAPLOW(step1[13] + step1[18], bd);
- output[14] = WRAPLOW(step1[14] + step1[17], bd);
- output[15] = WRAPLOW(step1[15] + step1[16], bd);
- output[16] = WRAPLOW(step1[15] - step1[16], bd);
- output[17] = WRAPLOW(step1[14] - step1[17], bd);
- output[18] = WRAPLOW(step1[13] - step1[18], bd);
- output[19] = WRAPLOW(step1[12] - step1[19], bd);
- output[20] = WRAPLOW(step1[11] - step1[20], bd);
- output[21] = WRAPLOW(step1[10] - step1[21], bd);
- output[22] = WRAPLOW(step1[9] - step1[22], bd);
- output[23] = WRAPLOW(step1[8] - step1[23], bd);
- output[24] = WRAPLOW(step1[7] - step1[24], bd);
- output[25] = WRAPLOW(step1[6] - step1[25], bd);
- output[26] = WRAPLOW(step1[5] - step1[26], bd);
- output[27] = WRAPLOW(step1[4] - step1[27], bd);
- output[28] = WRAPLOW(step1[3] - step1[28], bd);
- output[29] = WRAPLOW(step1[2] - step1[29], bd);
- output[30] = WRAPLOW(step1[1] - step1[30], bd);
- output[31] = WRAPLOW(step1[0] - step1[31], bd);
-}
-
-void av1_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[32 * 32];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // Rows
- for (i = 0; i < 32; ++i) {
- tran_low_t zero_coeff[16];
- for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
- for (j = 0; j < 8; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 4; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 2; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
- if (zero_coeff[0] | zero_coeff[1])
- highbd_idct32_c(input, outptr, bd);
- else
- memset(outptr, 0, sizeof(tran_low_t) * 32);
- input += 32;
- outptr += 32;
- }
-
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- highbd_idct32_c(temp_in, temp_out, bd);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
-}
-
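The zero_coeff loop above is an OR-reduction: after the four halving passes, (zero_coeff[0] | zero_coeff[1]) is non-zero exactly when any of the 32 coefficients in the row is non-zero, which lets all-zero rows skip the 32-point transform. A simpler equivalent predicate, shown only for illustration (row_has_nonzero is a hypothetical helper, assuming the file's tran_low_t type):

    static int row_has_nonzero(const tran_low_t *row) {
      tran_low_t acc = 0;
      int j;
      for (j = 0; j < 32; ++j) acc |= row[j];
      return acc != 0;
    }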
-void av1_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[32 * 32] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // Rows
- // Only the upper-left 8x8 area has non-zero coeffs.
- for (i = 0; i < 8; ++i) {
- highbd_idct32_c(input, outptr, bd);
- input += 32;
- outptr += 32;
- }
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- highbd_idct32_c(temp_in, temp_out, bd);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
-}
-
-void av1_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- int i, j;
- int a1;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- tran_low_t out =
- WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
- out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
- a1 = ROUND_POWER_OF_TWO(out, 6);
-
- for (j = 0; j < 32; ++j) {
- for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
- dest += stride;
- }
-}
-#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/av1_inv_txfm.h b/av1/common/av1_inv_txfm.h
deleted file mode 100644
index 9470d29..0000000
--- a/av1/common/av1_inv_txfm.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_INV_TXFM_H_
-#define AOM_DSP_INV_TXFM_H_
-
-#include <assert.h>
-
-#include "./aom_config.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-static INLINE tran_low_t check_range(tran_high_t input) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
- // For valid input streams, intermediate stage coefficients should always
- // stay within the range of a signed 16 bit integer. Coefficients can go out
- // of this range for invalid/corrupt streams. However, strictly checking
- // this range for every intermediate coefficient can be burdensome for a decoder,
- // therefore the following assertion is only enabled when configured with
- // --enable-coefficient-range-checking.
- assert(INT16_MIN <= input);
- assert(input <= INT16_MAX);
-#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
- return (tran_low_t)input;
-}
-
-static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
- tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- return check_range(rv);
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-static INLINE tran_low_t highbd_check_range(tran_high_t input, int bd) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
- // For valid highbitdepth streams, intermediate stage coefficients will
- // stay within the ranges:
- // - 8 bit: signed 16 bit integer
- // - 10 bit: signed 18 bit integer
- // - 12 bit: signed 20 bit integer
- const int32_t int_max = (1 << (7 + bd)) - 1;
- const int32_t int_min = -int_max - 1;
- assert(int_min <= input);
- assert(input <= int_max);
- (void)int_min;
-#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
- (void)bd;
- return (tran_low_t)input;
-}
-
-static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
- int bd) {
- tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- return highbd_check_range(rv, bd);
-}
-#endif // CONFIG_AOM_HIGHBITDEPTH
-
-#if CONFIG_EMULATE_HARDWARE
-// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
-// non-normative method to handle overflows. A stream that causes
-// overflows in the inverse transform is considered invalid,
-// and a hardware implementer is free to choose any reasonable
-// method to handle overflows. However to aid in hardware
-// verification they can use a specific implementation of the
-// WRAPLOW() macro below that is identical to their intended
-// hardware implementation (and also use configure options to trigger
-// the C-implementation of the transform).
-//
-// The particular WRAPLOW implementation below performs strict
-// overflow wrapping to match common hardware implementations.
-// bd of 8 uses trans_low with 16bits, need to remove 16bits
-// bd of 10 uses trans_low with 18bits, need to remove 14bits
-// bd of 12 uses trans_low with 20bits, need to remove 12bits
-// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
-#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
-#else
-#define WRAPLOW(x, bd) ((int32_t)(x))
-#endif // CONFIG_EMULATE_HARDWARE
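As a worked example of the emulate-hardware behaviour described above: WRAPLOW(x, bd) keeps only the low (8 + bd) bits of x with sign extension, so for an 8-bit stream intermediate values wrap to a signed 16-bit range. A standalone sketch (written with an unsigned left shift to avoid signed-overflow undefined behaviour; the macro itself shifts the signed value directly):

    #include <assert.h>
    #include <stdint.h>

    static int32_t wraplow_emulated(int32_t x, int bd) {
      const int shift = 24 - bd;
      return ((int32_t)((uint32_t)x << shift)) >> shift;
    }

    int main(void) {
      assert(wraplow_emulated(0x9234, 8) == -28108);  /* 0x9234 wraps to signed 16 bits */
      assert(wraplow_emulated(0x9234, 10) == 0x9234); /* fits in 18 bits, unchanged */
      return 0;
    }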
-
-void av1_idct4_c(const tran_low_t *input, tran_low_t *output);
-void av1_idct8_c(const tran_low_t *input, tran_low_t *output);
-void av1_idct16_c(const tran_low_t *input, tran_low_t *output);
-void av1_idct32_c(const tran_low_t *input, tran_low_t *output);
-void av1_iadst4_c(const tran_low_t *input, tran_low_t *output);
-void av1_iadst8_c(const tran_low_t *input, tran_low_t *output);
-void av1_iadst16_c(const tran_low_t *input, tran_low_t *output);
-
-#if CONFIG_AOM_HIGHBITDEPTH
-void av1_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
-void av1_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
-void av1_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
-
-void av1_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
-void av1_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
-void av1_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd);
-
-static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
- int bd) {
- trans = WRAPLOW(trans, bd);
- return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
-}
-#endif
-
-static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
- trans = WRAPLOW(trans, 8);
- return clip_pixel(WRAPLOW(dest + trans, 8));
-}
-#ifdef __cplusplus
-} // extern "C"
-#endif
-#endif // AOM_DSP_INV_TXFM_H_
diff --git a/av1/common/av1_inv_txfm1d.c b/av1/common/av1_inv_txfm1d.c
new file mode 100644
index 0000000..40d8403
--- /dev/null
+++ b/av1/common/av1_inv_txfm1d.c
@@ -0,0 +1,2333 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "av1/common/av1_inv_txfm1d.h"
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#define range_check(stage, input, buf, size, bit) \
+ { \
+ int i, j; \
+ for (i = 0; i < size; ++i) { \
+ int buf_bit = get_max_bit(abs(buf[i])) + 1; \
+ if (buf_bit > bit) { \
+ printf("======== %s %d overflow ========\n", __FILE__, __LINE__); \
+ printf("stage: %d node: %d\n", stage, i); \
+ printf("bit: %d buf_bit: %d buf[i]: %d\n", bit, buf_bit, buf[i]); \
+ printf("input:\n"); \
+ for (j = 0; j < size; j++) { \
+ printf("%d,", input[j]); \
+ } \
+ printf("\n"); \
+ assert(0); \
+ } \
+ } \
+ }
+#else
+#define range_check(stage, input, buf, size, bit) \
+ { \
+ (void)stage; \
+ (void)input; \
+ (void)buf; \
+ (void)size; \
+ (void)bit; \
+ }
+#endif
+
+// TODO(angiebird): Make 1-d txfm functions static
+void av1_idct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 4;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[4];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[2];
+ bf1[2] = input[1];
+ bf1[3] = input[3];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = bf0[1] - bf0[2];
+ bf1[3] = bf0[0] - bf0[3];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
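Unlike the fixed-point constants in the removed code (cospi_16_64 and friends), these new 1-D transforms look cosine values up per stage: cospi_arr[cos_bit[stage] - cos_bit_min][i] is, roughly, cos(i*pi/128) scaled by 2^cos_bit[stage] (so cospi[32] plays the role of the old cospi_16_64), and stage_range[] gives the bit width that range_check() verifies after each stage. The butterfly helper half_btf() is defined elsewhere in the library; assuming it follows the usual round-shifted form, it behaves like this sketch (half_btf_sketch is an illustrative stand-in, not the library function):

    static int32_t half_btf_sketch(int32_t w0, int32_t in0, int32_t w1,
                                   int32_t in1, int bit) {
      const int64_t result = (int64_t)w0 * in0 + (int64_t)w1 * in1;
      return (int32_t)((result + (1LL << (bit - 1))) >> bit);
    }

So, for example, stage 2 of av1_idct4_new computes bf1[0] as cospi[32] * (bf0[0] + bf0[1]) rounded down to cos_bit[stage] fractional bits, the same butterfly the deleted C implementation expressed with cospi_16_64 and dct_const_round_shift.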
+
+void av1_idct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 8;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[4];
+ bf1[2] = input[2];
+ bf1[3] = input[6];
+ bf1[4] = input[1];
+ bf1[5] = input[5];
+ bf1[6] = input[3];
+ bf1[7] = input[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = bf0[4] - bf0[5];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[6] + bf0[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = bf0[1] - bf0[2];
+ bf1[3] = bf0[0] - bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[7] = bf0[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = bf0[3] - bf0[4];
+ bf1[5] = bf0[2] - bf0[5];
+ bf1[6] = bf0[1] - bf0[6];
+ bf1[7] = bf0[0] - bf0[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_idct16_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 16;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[8];
+ bf1[2] = input[4];
+ bf1[3] = input[12];
+ bf1[4] = input[2];
+ bf1[5] = input[10];
+ bf1[6] = input[6];
+ bf1[7] = input[14];
+ bf1[8] = input[1];
+ bf1[9] = input[9];
+ bf1[10] = input[5];
+ bf1[11] = input[13];
+ bf1[12] = input[3];
+ bf1[13] = input[11];
+ bf1[14] = input[7];
+ bf1[15] = input[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = bf0[8] - bf0[9];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[10] + bf0[11];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = bf0[12] - bf0[13];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[14] + bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = bf0[4] - bf0[5];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[6] + bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
+ bf1[15] = bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = bf0[1] - bf0[2];
+ bf1[3] = bf0[0] - bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = bf0[9] - bf0[10];
+ bf1[11] = bf0[8] - bf0[11];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[13] + bf0[14];
+ bf1[15] = bf0[12] + bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = bf0[3] - bf0[4];
+ bf1[5] = bf0[2] - bf0[5];
+ bf1[6] = bf0[1] - bf0[6];
+ bf1[7] = bf0[0] - bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = bf0[7] - bf0[8];
+ bf1[9] = bf0[6] - bf0[9];
+ bf1[10] = bf0[5] - bf0[10];
+ bf1[11] = bf0[4] - bf0[11];
+ bf1[12] = bf0[3] - bf0[12];
+ bf1[13] = bf0[2] - bf0[13];
+ bf1[14] = bf0[1] - bf0[14];
+ bf1[15] = bf0[0] - bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_idct32_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 32;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[32];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[16];
+ bf1[2] = input[8];
+ bf1[3] = input[24];
+ bf1[4] = input[4];
+ bf1[5] = input[20];
+ bf1[6] = input[12];
+ bf1[7] = input[28];
+ bf1[8] = input[2];
+ bf1[9] = input[18];
+ bf1[10] = input[10];
+ bf1[11] = input[26];
+ bf1[12] = input[6];
+ bf1[13] = input[22];
+ bf1[14] = input[14];
+ bf1[15] = input[30];
+ bf1[16] = input[1];
+ bf1[17] = input[17];
+ bf1[18] = input[9];
+ bf1[19] = input[25];
+ bf1[20] = input[5];
+ bf1[21] = input[21];
+ bf1[22] = input[13];
+ bf1[23] = input[29];
+ bf1[24] = input[3];
+ bf1[25] = input[19];
+ bf1[26] = input[11];
+ bf1[27] = input[27];
+ bf1[28] = input[7];
+ bf1[29] = input[23];
+ bf1[30] = input[15];
+ bf1[31] = input[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit[stage]);
+ bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit[stage]);
+ bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit[stage]);
+ bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit[stage]);
+ bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = bf0[16] - bf0[17];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[18] + bf0[19];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = bf0[20] - bf0[21];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[22] + bf0[23];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = bf0[24] - bf0[25];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[26] + bf0[27];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = bf0[28] - bf0[29];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[30] + bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = bf0[8] - bf0[9];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[10] + bf0[11];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = bf0[12] - bf0[13];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[14] + bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit[stage]);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit[stage]);
+ bf1[31] = bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = bf0[4] - bf0[5];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[6] + bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = bf0[17] - bf0[18];
+ bf1[19] = bf0[16] - bf0[19];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[21] + bf0[22];
+ bf1[23] = bf0[20] + bf0[23];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = bf0[25] - bf0[26];
+ bf1[27] = bf0[24] - bf0[27];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[29] + bf0[30];
+ bf1[31] = bf0[28] + bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = bf0[1] - bf0[2];
+ bf1[3] = bf0[0] - bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = bf0[9] - bf0[10];
+ bf1[11] = bf0[8] - bf0[11];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[13] + bf0[14];
+ bf1[15] = bf0[12] + bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit[stage]);
+ bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit[stage]);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = bf0[3] - bf0[4];
+ bf1[5] = bf0[2] - bf0[5];
+ bf1[6] = bf0[1] - bf0[6];
+ bf1[7] = bf0[0] - bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = bf0[19] - bf0[20];
+ bf1[21] = bf0[18] - bf0[21];
+ bf1[22] = bf0[17] - bf0[22];
+ bf1[23] = bf0[16] - bf0[23];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[27] + bf0[28];
+ bf1[29] = bf0[26] + bf0[29];
+ bf1[30] = bf0[25] + bf0[30];
+ bf1[31] = bf0[24] + bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = bf0[7] - bf0[8];
+ bf1[9] = bf0[6] - bf0[9];
+ bf1[10] = bf0[5] - bf0[10];
+ bf1[11] = bf0[4] - bf0[11];
+ bf1[12] = bf0[3] - bf0[12];
+ bf1[13] = bf0[2] - bf0[13];
+ bf1[14] = bf0[1] - bf0[14];
+ bf1[15] = bf0[0] - bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[31];
+ bf1[1] = bf0[1] + bf0[30];
+ bf1[2] = bf0[2] + bf0[29];
+ bf1[3] = bf0[3] + bf0[28];
+ bf1[4] = bf0[4] + bf0[27];
+ bf1[5] = bf0[5] + bf0[26];
+ bf1[6] = bf0[6] + bf0[25];
+ bf1[7] = bf0[7] + bf0[24];
+ bf1[8] = bf0[8] + bf0[23];
+ bf1[9] = bf0[9] + bf0[22];
+ bf1[10] = bf0[10] + bf0[21];
+ bf1[11] = bf0[11] + bf0[20];
+ bf1[12] = bf0[12] + bf0[19];
+ bf1[13] = bf0[13] + bf0[18];
+ bf1[14] = bf0[14] + bf0[17];
+ bf1[15] = bf0[15] + bf0[16];
+ bf1[16] = bf0[15] - bf0[16];
+ bf1[17] = bf0[14] - bf0[17];
+ bf1[18] = bf0[13] - bf0[18];
+ bf1[19] = bf0[12] - bf0[19];
+ bf1[20] = bf0[11] - bf0[20];
+ bf1[21] = bf0[10] - bf0[21];
+ bf1[22] = bf0[9] - bf0[22];
+ bf1[23] = bf0[8] - bf0[23];
+ bf1[24] = bf0[7] - bf0[24];
+ bf1[25] = bf0[6] - bf0[25];
+ bf1[26] = bf0[5] - bf0[26];
+ bf1[27] = bf0[4] - bf0[27];
+ bf1[28] = bf0[3] - bf0[28];
+ bf1[29] = bf0[2] - bf0[29];
+ bf1[30] = bf0[1] - bf0[30];
+ bf1[31] = bf0[0] - bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
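+// 4-point inverse ADST.  Like the inverse DCTs above, each stage ping-pongs
+// between `output` and the local `step` buffer and is followed by a
+// range_check().  half_btf(w0, in0, w1, in1, bit) is assumed to return
+// round_shift(w0 * in0 + w1 * in1, bit), with cospi[i] holding
+// cos(i * PI / 64) scaled by 2^cos_bit (see av1_txfm.h).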
+void av1_iadst4_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 4;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[4];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[3];
+ bf1[2] = -input[1];
+ bf1[3] = input[2];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[56], bf0[0], -cospi[8], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[24], bf0[2], -cospi[40], bf0[3], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[2];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
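+// 8-point inverse ADST: a sign/reorder load stage, alternating butterfly and
+// add/sub stages, and a final output permutation (stage 7).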
+void av1_iadst8_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 8;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[7];
+ bf1[2] = -input[3];
+ bf1[3] = input[4];
+ bf1[4] = -input[1];
+ bf1[5] = input[6];
+ bf1[6] = input[2];
+ bf1[7] = -input[5];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = bf0[4] - bf0[6];
+ bf1[7] = bf0[5] - bf0[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = bf0[0] - bf0[4];
+ bf1[5] = bf0[1] - bf0[5];
+ bf1[6] = bf0[2] - bf0[6];
+ bf1[7] = bf0[3] - bf0[7];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit[stage]);
+ bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[6];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[4];
+ bf1[4] = bf0[5];
+ bf1[5] = bf0[2];
+ bf1[6] = bf0[7];
+ bf1[7] = bf0[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
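+// 16-point inverse ADST; same stage pattern as av1_iadst8_new, ending in the
+// stage-9 output permutation.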
+void av1_iadst16_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 16;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[15];
+ bf1[2] = -input[7];
+ bf1[3] = input[8];
+ bf1[4] = -input[3];
+ bf1[5] = input[12];
+ bf1[6] = input[4];
+ bf1[7] = -input[11];
+ bf1[8] = -input[1];
+ bf1[9] = input[14];
+ bf1[10] = input[6];
+ bf1[11] = -input[9];
+ bf1[12] = input[2];
+ bf1[13] = -input[13];
+ bf1[14] = -input[5];
+ bf1[15] = input[10];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit[stage]);
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = bf0[4] - bf0[6];
+ bf1[7] = bf0[5] - bf0[7];
+ bf1[8] = bf0[8] + bf0[10];
+ bf1[9] = bf0[9] + bf0[11];
+ bf1[10] = bf0[8] - bf0[10];
+ bf1[11] = bf0[9] - bf0[11];
+ bf1[12] = bf0[12] + bf0[14];
+ bf1[13] = bf0[13] + bf0[15];
+ bf1[14] = bf0[12] - bf0[14];
+ bf1[15] = bf0[13] - bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = bf0[0] - bf0[4];
+ bf1[5] = bf0[1] - bf0[5];
+ bf1[6] = bf0[2] - bf0[6];
+ bf1[7] = bf0[3] - bf0[7];
+ bf1[8] = bf0[8] + bf0[12];
+ bf1[9] = bf0[9] + bf0[13];
+ bf1[10] = bf0[10] + bf0[14];
+ bf1[11] = bf0[11] + bf0[15];
+ bf1[12] = bf0[8] - bf0[12];
+ bf1[13] = bf0[9] - bf0[13];
+ bf1[14] = bf0[10] - bf0[14];
+ bf1[15] = bf0[11] - bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit[stage]);
+ bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[8];
+ bf1[1] = bf0[1] + bf0[9];
+ bf1[2] = bf0[2] + bf0[10];
+ bf1[3] = bf0[3] + bf0[11];
+ bf1[4] = bf0[4] + bf0[12];
+ bf1[5] = bf0[5] + bf0[13];
+ bf1[6] = bf0[6] + bf0[14];
+ bf1[7] = bf0[7] + bf0[15];
+ bf1[8] = bf0[0] - bf0[8];
+ bf1[9] = bf0[1] - bf0[9];
+ bf1[10] = bf0[2] - bf0[10];
+ bf1[11] = bf0[3] - bf0[11];
+ bf1[12] = bf0[4] - bf0[12];
+ bf1[13] = bf0[5] - bf0[13];
+ bf1[14] = bf0[6] - bf0[14];
+ bf1[15] = bf0[7] - bf0[15];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit[stage]);
+ bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit[stage]);
+ bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[14];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[12];
+ bf1[4] = bf0[5];
+ bf1[5] = bf0[10];
+ bf1[6] = bf0[7];
+ bf1[7] = bf0[8];
+ bf1[8] = bf0[9];
+ bf1[9] = bf0[6];
+ bf1[10] = bf0[11];
+ bf1[11] = bf0[4];
+ bf1[12] = bf0[13];
+ bf1[13] = bf0[2];
+ bf1[14] = bf0[15];
+ bf1[15] = bf0[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
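+// 32-point inverse ADST, the widest ADST variant here; stage 11 applies the
+// output permutation.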
+void av1_iadst32_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 32;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[32];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[31];
+ bf1[2] = -input[15];
+ bf1[3] = input[16];
+ bf1[4] = -input[7];
+ bf1[5] = input[24];
+ bf1[6] = input[8];
+ bf1[7] = -input[23];
+ bf1[8] = -input[3];
+ bf1[9] = input[28];
+ bf1[10] = input[12];
+ bf1[11] = -input[19];
+ bf1[12] = input[4];
+ bf1[13] = -input[27];
+ bf1[14] = -input[11];
+ bf1[15] = input[20];
+ bf1[16] = -input[1];
+ bf1[17] = input[30];
+ bf1[18] = input[14];
+ bf1[19] = -input[17];
+ bf1[20] = input[6];
+ bf1[21] = -input[25];
+ bf1[22] = -input[9];
+ bf1[23] = input[22];
+ bf1[24] = input[2];
+ bf1[25] = -input[29];
+ bf1[26] = -input[13];
+ bf1[27] = input[18];
+ bf1[28] = -input[5];
+ bf1[29] = input[26];
+ bf1[30] = input[10];
+ bf1[31] = -input[21];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit[stage]);
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit[stage]);
+ bf1[19] = half_btf(cospi[32], bf0[18], -cospi[32], bf0[19], cos_bit[stage]);
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[32], bf0[22], -cospi[32], bf0[23], cos_bit[stage]);
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[32], bf0[26], -cospi[32], bf0[27], cos_bit[stage]);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[32], bf0[30], -cospi[32], bf0[31], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = bf0[4] - bf0[6];
+ bf1[7] = bf0[5] - bf0[7];
+ bf1[8] = bf0[8] + bf0[10];
+ bf1[9] = bf0[9] + bf0[11];
+ bf1[10] = bf0[8] - bf0[10];
+ bf1[11] = bf0[9] - bf0[11];
+ bf1[12] = bf0[12] + bf0[14];
+ bf1[13] = bf0[13] + bf0[15];
+ bf1[14] = bf0[12] - bf0[14];
+ bf1[15] = bf0[13] - bf0[15];
+ bf1[16] = bf0[16] + bf0[18];
+ bf1[17] = bf0[17] + bf0[19];
+ bf1[18] = bf0[16] - bf0[18];
+ bf1[19] = bf0[17] - bf0[19];
+ bf1[20] = bf0[20] + bf0[22];
+ bf1[21] = bf0[21] + bf0[23];
+ bf1[22] = bf0[20] - bf0[22];
+ bf1[23] = bf0[21] - bf0[23];
+ bf1[24] = bf0[24] + bf0[26];
+ bf1[25] = bf0[25] + bf0[27];
+ bf1[26] = bf0[24] - bf0[26];
+ bf1[27] = bf0[25] - bf0[27];
+ bf1[28] = bf0[28] + bf0[30];
+ bf1[29] = bf0[29] + bf0[31];
+ bf1[30] = bf0[28] - bf0[30];
+ bf1[31] = bf0[29] - bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit[stage]);
+ bf1[21] = half_btf(cospi[48], bf0[20], -cospi[16], bf0[21], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[16], bf0[22], cospi[48], bf0[23], cos_bit[stage]);
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = bf0[26];
+ bf1[27] = bf0[27];
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[48], bf0[28], -cospi[16], bf0[29], cos_bit[stage]);
+ bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[16], bf0[30], cospi[48], bf0[31], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = bf0[0] - bf0[4];
+ bf1[5] = bf0[1] - bf0[5];
+ bf1[6] = bf0[2] - bf0[6];
+ bf1[7] = bf0[3] - bf0[7];
+ bf1[8] = bf0[8] + bf0[12];
+ bf1[9] = bf0[9] + bf0[13];
+ bf1[10] = bf0[10] + bf0[14];
+ bf1[11] = bf0[11] + bf0[15];
+ bf1[12] = bf0[8] - bf0[12];
+ bf1[13] = bf0[9] - bf0[13];
+ bf1[14] = bf0[10] - bf0[14];
+ bf1[15] = bf0[11] - bf0[15];
+ bf1[16] = bf0[16] + bf0[20];
+ bf1[17] = bf0[17] + bf0[21];
+ bf1[18] = bf0[18] + bf0[22];
+ bf1[19] = bf0[19] + bf0[23];
+ bf1[20] = bf0[16] - bf0[20];
+ bf1[21] = bf0[17] - bf0[21];
+ bf1[22] = bf0[18] - bf0[22];
+ bf1[23] = bf0[19] - bf0[23];
+ bf1[24] = bf0[24] + bf0[28];
+ bf1[25] = bf0[25] + bf0[29];
+ bf1[26] = bf0[26] + bf0[30];
+ bf1[27] = bf0[27] + bf0[31];
+ bf1[28] = bf0[24] - bf0[28];
+ bf1[29] = bf0[25] - bf0[29];
+ bf1[30] = bf0[26] - bf0[30];
+ bf1[31] = bf0[27] - bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit[stage]);
+ bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[56], bf0[24], -cospi[8], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[24], bf0[26], -cospi[40], bf0[27], cos_bit[stage]);
+ bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[8], bf0[28], cospi[56], bf0[29], cos_bit[stage]);
+ bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[40], bf0[30], cospi[24], bf0[31], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[8];
+ bf1[1] = bf0[1] + bf0[9];
+ bf1[2] = bf0[2] + bf0[10];
+ bf1[3] = bf0[3] + bf0[11];
+ bf1[4] = bf0[4] + bf0[12];
+ bf1[5] = bf0[5] + bf0[13];
+ bf1[6] = bf0[6] + bf0[14];
+ bf1[7] = bf0[7] + bf0[15];
+ bf1[8] = bf0[0] - bf0[8];
+ bf1[9] = bf0[1] - bf0[9];
+ bf1[10] = bf0[2] - bf0[10];
+ bf1[11] = bf0[3] - bf0[11];
+ bf1[12] = bf0[4] - bf0[12];
+ bf1[13] = bf0[5] - bf0[13];
+ bf1[14] = bf0[6] - bf0[14];
+ bf1[15] = bf0[7] - bf0[15];
+ bf1[16] = bf0[16] + bf0[24];
+ bf1[17] = bf0[17] + bf0[25];
+ bf1[18] = bf0[18] + bf0[26];
+ bf1[19] = bf0[19] + bf0[27];
+ bf1[20] = bf0[20] + bf0[28];
+ bf1[21] = bf0[21] + bf0[29];
+ bf1[22] = bf0[22] + bf0[30];
+ bf1[23] = bf0[23] + bf0[31];
+ bf1[24] = bf0[16] - bf0[24];
+ bf1[25] = bf0[17] - bf0[25];
+ bf1[26] = bf0[18] - bf0[26];
+ bf1[27] = bf0[19] - bf0[27];
+ bf1[28] = bf0[20] - bf0[28];
+ bf1[29] = bf0[21] - bf0[29];
+ bf1[30] = bf0[22] - bf0[30];
+ bf1[31] = bf0[23] - bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit[stage]);
+ bf1[17] = half_btf(cospi[60], bf0[16], -cospi[4], bf0[17], cos_bit[stage]);
+ bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit[stage]);
+ bf1[19] = half_btf(cospi[44], bf0[18], -cospi[20], bf0[19], cos_bit[stage]);
+ bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit[stage]);
+ bf1[21] = half_btf(cospi[28], bf0[20], -cospi[36], bf0[21], cos_bit[stage]);
+ bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[12], bf0[22], -cospi[52], bf0[23], cos_bit[stage]);
+ bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[4], bf0[24], cospi[60], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[20], bf0[26], cospi[44], bf0[27], cos_bit[stage]);
+ bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[36], bf0[28], cospi[28], bf0[29], cos_bit[stage]);
+ bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[52], bf0[30], cospi[12], bf0[31], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[16];
+ bf1[1] = bf0[1] + bf0[17];
+ bf1[2] = bf0[2] + bf0[18];
+ bf1[3] = bf0[3] + bf0[19];
+ bf1[4] = bf0[4] + bf0[20];
+ bf1[5] = bf0[5] + bf0[21];
+ bf1[6] = bf0[6] + bf0[22];
+ bf1[7] = bf0[7] + bf0[23];
+ bf1[8] = bf0[8] + bf0[24];
+ bf1[9] = bf0[9] + bf0[25];
+ bf1[10] = bf0[10] + bf0[26];
+ bf1[11] = bf0[11] + bf0[27];
+ bf1[12] = bf0[12] + bf0[28];
+ bf1[13] = bf0[13] + bf0[29];
+ bf1[14] = bf0[14] + bf0[30];
+ bf1[15] = bf0[15] + bf0[31];
+ bf1[16] = bf0[0] - bf0[16];
+ bf1[17] = bf0[1] - bf0[17];
+ bf1[18] = bf0[2] - bf0[18];
+ bf1[19] = bf0[3] - bf0[19];
+ bf1[20] = bf0[4] - bf0[20];
+ bf1[21] = bf0[5] - bf0[21];
+ bf1[22] = bf0[6] - bf0[22];
+ bf1[23] = bf0[7] - bf0[23];
+ bf1[24] = bf0[8] - bf0[24];
+ bf1[25] = bf0[9] - bf0[25];
+ bf1[26] = bf0[10] - bf0[26];
+ bf1[27] = bf0[11] - bf0[27];
+ bf1[28] = bf0[12] - bf0[28];
+ bf1[29] = bf0[13] - bf0[29];
+ bf1[30] = bf0[14] - bf0[30];
+ bf1[31] = bf0[15] - bf0[31];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 10
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[63], bf0[0], -cospi[1], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[59], bf0[2], -cospi[5], bf0[3], cos_bit[stage]);
+ bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[55], bf0[4], -cospi[9], bf0[5], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[51], bf0[6], -cospi[13], bf0[7], cos_bit[stage]);
+ bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[47], bf0[8], -cospi[17], bf0[9], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[43], bf0[10], -cospi[21], bf0[11], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[39], bf0[12], -cospi[25], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[35], bf0[14], -cospi[29], bf0[15], cos_bit[stage]);
+ bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit[stage]);
+ bf1[17] = half_btf(cospi[31], bf0[16], -cospi[33], bf0[17], cos_bit[stage]);
+ bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit[stage]);
+ bf1[19] = half_btf(cospi[27], bf0[18], -cospi[37], bf0[19], cos_bit[stage]);
+ bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit[stage]);
+ bf1[21] = half_btf(cospi[23], bf0[20], -cospi[41], bf0[21], cos_bit[stage]);
+ bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[19], bf0[22], -cospi[45], bf0[23], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[15], bf0[24], -cospi[49], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[11], bf0[26], -cospi[53], bf0[27], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[7], bf0[28], -cospi[57], bf0[29], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[3], bf0[30], -cospi[61], bf0[31], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 11
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[30];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[28];
+ bf1[4] = bf0[5];
+ bf1[5] = bf0[26];
+ bf1[6] = bf0[7];
+ bf1[7] = bf0[24];
+ bf1[8] = bf0[9];
+ bf1[9] = bf0[22];
+ bf1[10] = bf0[11];
+ bf1[11] = bf0[20];
+ bf1[12] = bf0[13];
+ bf1[13] = bf0[18];
+ bf1[14] = bf0[15];
+ bf1[15] = bf0[16];
+ bf1[16] = bf0[17];
+ bf1[17] = bf0[14];
+ bf1[18] = bf0[19];
+ bf1[19] = bf0[12];
+ bf1[20] = bf0[21];
+ bf1[21] = bf0[10];
+ bf1[22] = bf0[23];
+ bf1[23] = bf0[8];
+ bf1[24] = bf0[25];
+ bf1[25] = bf0[6];
+ bf1[26] = bf0[27];
+ bf1[27] = bf0[4];
+ bf1[28] = bf0[29];
+ bf1[29] = bf0[2];
+ bf1[30] = bf0[31];
+ bf1[31] = bf0[0];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+#if CONFIG_TX64X64
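+// 64-point inverse DCT, compiled only when CONFIG_TX64X64 is enabled.
+// Stage 1 loads the coefficients in bit-reversed order; the later stages are
+// the usual half_btf butterfly and add/sub passes, mirroring av1_idct32_new.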
+void av1_idct64_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int32_t size = 64;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[64];
+
+ // stage 0;
+ range_check(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[32];
+ bf1[2] = input[16];
+ bf1[3] = input[48];
+ bf1[4] = input[8];
+ bf1[5] = input[40];
+ bf1[6] = input[24];
+ bf1[7] = input[56];
+ bf1[8] = input[4];
+ bf1[9] = input[36];
+ bf1[10] = input[20];
+ bf1[11] = input[52];
+ bf1[12] = input[12];
+ bf1[13] = input[44];
+ bf1[14] = input[28];
+ bf1[15] = input[60];
+ bf1[16] = input[2];
+ bf1[17] = input[34];
+ bf1[18] = input[18];
+ bf1[19] = input[50];
+ bf1[20] = input[10];
+ bf1[21] = input[42];
+ bf1[22] = input[26];
+ bf1[23] = input[58];
+ bf1[24] = input[6];
+ bf1[25] = input[38];
+ bf1[26] = input[22];
+ bf1[27] = input[54];
+ bf1[28] = input[14];
+ bf1[29] = input[46];
+ bf1[30] = input[30];
+ bf1[31] = input[62];
+ bf1[32] = input[1];
+ bf1[33] = input[33];
+ bf1[34] = input[17];
+ bf1[35] = input[49];
+ bf1[36] = input[9];
+ bf1[37] = input[41];
+ bf1[38] = input[25];
+ bf1[39] = input[57];
+ bf1[40] = input[5];
+ bf1[41] = input[37];
+ bf1[42] = input[21];
+ bf1[43] = input[53];
+ bf1[44] = input[13];
+ bf1[45] = input[45];
+ bf1[46] = input[29];
+ bf1[47] = input[61];
+ bf1[48] = input[3];
+ bf1[49] = input[35];
+ bf1[50] = input[19];
+ bf1[51] = input[51];
+ bf1[52] = input[11];
+ bf1[53] = input[43];
+ bf1[54] = input[27];
+ bf1[55] = input[59];
+ bf1[56] = input[7];
+ bf1[57] = input[39];
+ bf1[58] = input[23];
+ bf1[59] = input[55];
+ bf1[60] = input[15];
+ bf1[61] = input[47];
+ bf1[62] = input[31];
+ bf1[63] = input[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = bf0[26];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit[stage]);
+ bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit[stage]);
+ bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit[stage]);
+ bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit[stage]);
+ bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit[stage]);
+ bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit[stage]);
+ bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit[stage]);
+ bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit[stage]);
+ bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit[stage]);
+ bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit[stage]);
+ bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit[stage]);
+ bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit[stage]);
+ bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit[stage]);
+ bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit[stage]);
+ bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit[stage]);
+ bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit[stage]);
+ bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit[stage]);
+ bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit[stage]);
+ bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit[stage]);
+ bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit[stage]);
+ bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit[stage]);
+ bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit[stage]);
+ bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit[stage]);
+ bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit[stage]);
+ bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit[stage]);
+ bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit[stage]);
+ bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit[stage]);
+ bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit[stage]);
+ bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit[stage]);
+ bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit[stage]);
+ bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit[stage]);
+ bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit[stage]);
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit[stage]);
+ bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit[stage]);
+ bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit[stage]);
+ bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit[stage]);
+ bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit[stage]);
+ bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit[stage]);
+ bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit[stage]);
+ bf1[32] = bf0[32] + bf0[33];
+ bf1[33] = bf0[32] - bf0[33];
+ bf1[34] = -bf0[34] + bf0[35];
+ bf1[35] = bf0[34] + bf0[35];
+ bf1[36] = bf0[36] + bf0[37];
+ bf1[37] = bf0[36] - bf0[37];
+ bf1[38] = -bf0[38] + bf0[39];
+ bf1[39] = bf0[38] + bf0[39];
+ bf1[40] = bf0[40] + bf0[41];
+ bf1[41] = bf0[40] - bf0[41];
+ bf1[42] = -bf0[42] + bf0[43];
+ bf1[43] = bf0[42] + bf0[43];
+ bf1[44] = bf0[44] + bf0[45];
+ bf1[45] = bf0[44] - bf0[45];
+ bf1[46] = -bf0[46] + bf0[47];
+ bf1[47] = bf0[46] + bf0[47];
+ bf1[48] = bf0[48] + bf0[49];
+ bf1[49] = bf0[48] - bf0[49];
+ bf1[50] = -bf0[50] + bf0[51];
+ bf1[51] = bf0[50] + bf0[51];
+ bf1[52] = bf0[52] + bf0[53];
+ bf1[53] = bf0[52] - bf0[53];
+ bf1[54] = -bf0[54] + bf0[55];
+ bf1[55] = bf0[54] + bf0[55];
+ bf1[56] = bf0[56] + bf0[57];
+ bf1[57] = bf0[56] - bf0[57];
+ bf1[58] = -bf0[58] + bf0[59];
+ bf1[59] = bf0[58] + bf0[59];
+ bf1[60] = bf0[60] + bf0[61];
+ bf1[61] = bf0[60] - bf0[61];
+ bf1[62] = -bf0[62] + bf0[63];
+ bf1[63] = bf0[62] + bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
+ bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
+ bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = bf0[16] - bf0[17];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[18] + bf0[19];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = bf0[20] - bf0[21];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[22] + bf0[23];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = bf0[24] - bf0[25];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[26] + bf0[27];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = bf0[28] - bf0[29];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[30] + bf0[31];
+ bf1[32] = bf0[32];
+ bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]);
+ bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]);
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]);
+ bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]);
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]);
+ bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]);
+ bf1[43] = bf0[43];
+ bf1[44] = bf0[44];
+ bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]);
+ bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]);
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit[stage]);
+ bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit[stage]);
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[52];
+ bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit[stage]);
+ bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit[stage]);
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit[stage]);
+ bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit[stage]);
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit[stage]);
+ bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit[stage]);
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = bf0[8] - bf0[9];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[10] + bf0[11];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = bf0[12] - bf0[13];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[14] + bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit[stage]);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit[stage]);
+ bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit[stage]);
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[35];
+ bf1[33] = bf0[33] + bf0[34];
+ bf1[34] = bf0[33] - bf0[34];
+ bf1[35] = bf0[32] - bf0[35];
+ bf1[36] = -bf0[36] + bf0[39];
+ bf1[37] = -bf0[37] + bf0[38];
+ bf1[38] = bf0[37] + bf0[38];
+ bf1[39] = bf0[36] + bf0[39];
+ bf1[40] = bf0[40] + bf0[43];
+ bf1[41] = bf0[41] + bf0[42];
+ bf1[42] = bf0[41] - bf0[42];
+ bf1[43] = bf0[40] - bf0[43];
+ bf1[44] = -bf0[44] + bf0[47];
+ bf1[45] = -bf0[45] + bf0[46];
+ bf1[46] = bf0[45] + bf0[46];
+ bf1[47] = bf0[44] + bf0[47];
+ bf1[48] = bf0[48] + bf0[51];
+ bf1[49] = bf0[49] + bf0[50];
+ bf1[50] = bf0[49] - bf0[50];
+ bf1[51] = bf0[48] - bf0[51];
+ bf1[52] = -bf0[52] + bf0[55];
+ bf1[53] = -bf0[53] + bf0[54];
+ bf1[54] = bf0[53] + bf0[54];
+ bf1[55] = bf0[52] + bf0[55];
+ bf1[56] = bf0[56] + bf0[59];
+ bf1[57] = bf0[57] + bf0[58];
+ bf1[58] = bf0[57] - bf0[58];
+ bf1[59] = bf0[56] - bf0[59];
+ bf1[60] = -bf0[60] + bf0[63];
+ bf1[61] = -bf0[61] + bf0[62];
+ bf1[62] = bf0[61] + bf0[62];
+ bf1[63] = bf0[60] + bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = bf0[4] - bf0[5];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[6] + bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
+ bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = bf0[17] - bf0[18];
+ bf1[19] = bf0[16] - bf0[19];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[21] + bf0[22];
+ bf1[23] = bf0[20] + bf0[23];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = bf0[25] - bf0[26];
+ bf1[27] = bf0[24] - bf0[27];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[29] + bf0[30];
+ bf1[31] = bf0[28] + bf0[31];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]);
+ bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]);
+ bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]);
+ bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]);
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = bf0[41];
+ bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]);
+ bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]);
+ bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]);
+ bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]);
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit[stage]);
+ bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit[stage]);
+ bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit[stage]);
+ bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit[stage]);
+ bf1[54] = bf0[54];
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit[stage]);
+ bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit[stage]);
+ bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit[stage]);
+ bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit[stage]);
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = bf0[1] - bf0[2];
+ bf1[3] = bf0[0] - bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = bf0[9] - bf0[10];
+ bf1[11] = bf0[8] - bf0[11];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[13] + bf0[14];
+ bf1[15] = bf0[12] + bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit[stage]);
+ bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit[stage]);
+ bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit[stage]);
+ bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit[stage]);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[39];
+ bf1[33] = bf0[33] + bf0[38];
+ bf1[34] = bf0[34] + bf0[37];
+ bf1[35] = bf0[35] + bf0[36];
+ bf1[36] = bf0[35] - bf0[36];
+ bf1[37] = bf0[34] - bf0[37];
+ bf1[38] = bf0[33] - bf0[38];
+ bf1[39] = bf0[32] - bf0[39];
+ bf1[40] = -bf0[40] + bf0[47];
+ bf1[41] = -bf0[41] + bf0[46];
+ bf1[42] = -bf0[42] + bf0[45];
+ bf1[43] = -bf0[43] + bf0[44];
+ bf1[44] = bf0[43] + bf0[44];
+ bf1[45] = bf0[42] + bf0[45];
+ bf1[46] = bf0[41] + bf0[46];
+ bf1[47] = bf0[40] + bf0[47];
+ bf1[48] = bf0[48] + bf0[55];
+ bf1[49] = bf0[49] + bf0[54];
+ bf1[50] = bf0[50] + bf0[53];
+ bf1[51] = bf0[51] + bf0[52];
+ bf1[52] = bf0[51] - bf0[52];
+ bf1[53] = bf0[50] - bf0[53];
+ bf1[54] = bf0[49] - bf0[54];
+ bf1[55] = bf0[48] - bf0[55];
+ bf1[56] = -bf0[56] + bf0[63];
+ bf1[57] = -bf0[57] + bf0[62];
+ bf1[58] = -bf0[58] + bf0[61];
+ bf1[59] = -bf0[59] + bf0[60];
+ bf1[60] = bf0[59] + bf0[60];
+ bf1[61] = bf0[58] + bf0[61];
+ bf1[62] = bf0[57] + bf0[62];
+ bf1[63] = bf0[56] + bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = bf0[3] - bf0[4];
+ bf1[5] = bf0[2] - bf0[5];
+ bf1[6] = bf0[1] - bf0[6];
+ bf1[7] = bf0[0] - bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+ bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = bf0[19] - bf0[20];
+ bf1[21] = bf0[18] - bf0[21];
+ bf1[22] = bf0[17] - bf0[22];
+ bf1[23] = bf0[16] - bf0[23];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[27] + bf0[28];
+ bf1[29] = bf0[26] + bf0[29];
+ bf1[30] = bf0[25] + bf0[30];
+ bf1[31] = bf0[24] + bf0[31];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]);
+ bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]);
+ bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]);
+ bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]);
+ bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]);
+ bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]);
+ bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]);
+ bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]);
+ bf1[44] = bf0[44];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = bf0[50];
+ bf1[51] = bf0[51];
+ bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit[stage]);
+ bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit[stage]);
+ bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit[stage]);
+ bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit[stage]);
+ bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit[stage]);
+ bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit[stage]);
+ bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit[stage]);
+ bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit[stage]);
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = bf0[7] - bf0[8];
+ bf1[9] = bf0[6] - bf0[9];
+ bf1[10] = bf0[5] - bf0[10];
+ bf1[11] = bf0[4] - bf0[11];
+ bf1[12] = bf0[3] - bf0[12];
+ bf1[13] = bf0[2] - bf0[13];
+ bf1[14] = bf0[1] - bf0[14];
+ bf1[15] = bf0[0] - bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+ bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+ bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+ bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+ bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[47];
+ bf1[33] = bf0[33] + bf0[46];
+ bf1[34] = bf0[34] + bf0[45];
+ bf1[35] = bf0[35] + bf0[44];
+ bf1[36] = bf0[36] + bf0[43];
+ bf1[37] = bf0[37] + bf0[42];
+ bf1[38] = bf0[38] + bf0[41];
+ bf1[39] = bf0[39] + bf0[40];
+ bf1[40] = bf0[39] - bf0[40];
+ bf1[41] = bf0[38] - bf0[41];
+ bf1[42] = bf0[37] - bf0[42];
+ bf1[43] = bf0[36] - bf0[43];
+ bf1[44] = bf0[35] - bf0[44];
+ bf1[45] = bf0[34] - bf0[45];
+ bf1[46] = bf0[33] - bf0[46];
+ bf1[47] = bf0[32] - bf0[47];
+ bf1[48] = -bf0[48] + bf0[63];
+ bf1[49] = -bf0[49] + bf0[62];
+ bf1[50] = -bf0[50] + bf0[61];
+ bf1[51] = -bf0[51] + bf0[60];
+ bf1[52] = -bf0[52] + bf0[59];
+ bf1[53] = -bf0[53] + bf0[58];
+ bf1[54] = -bf0[54] + bf0[57];
+ bf1[55] = -bf0[55] + bf0[56];
+ bf1[56] = bf0[55] + bf0[56];
+ bf1[57] = bf0[54] + bf0[57];
+ bf1[58] = bf0[53] + bf0[58];
+ bf1[59] = bf0[52] + bf0[59];
+ bf1[60] = bf0[51] + bf0[60];
+ bf1[61] = bf0[50] + bf0[61];
+ bf1[62] = bf0[49] + bf0[62];
+ bf1[63] = bf0[48] + bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 10
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[31];
+ bf1[1] = bf0[1] + bf0[30];
+ bf1[2] = bf0[2] + bf0[29];
+ bf1[3] = bf0[3] + bf0[28];
+ bf1[4] = bf0[4] + bf0[27];
+ bf1[5] = bf0[5] + bf0[26];
+ bf1[6] = bf0[6] + bf0[25];
+ bf1[7] = bf0[7] + bf0[24];
+ bf1[8] = bf0[8] + bf0[23];
+ bf1[9] = bf0[9] + bf0[22];
+ bf1[10] = bf0[10] + bf0[21];
+ bf1[11] = bf0[11] + bf0[20];
+ bf1[12] = bf0[12] + bf0[19];
+ bf1[13] = bf0[13] + bf0[18];
+ bf1[14] = bf0[14] + bf0[17];
+ bf1[15] = bf0[15] + bf0[16];
+ bf1[16] = bf0[15] - bf0[16];
+ bf1[17] = bf0[14] - bf0[17];
+ bf1[18] = bf0[13] - bf0[18];
+ bf1[19] = bf0[12] - bf0[19];
+ bf1[20] = bf0[11] - bf0[20];
+ bf1[21] = bf0[10] - bf0[21];
+ bf1[22] = bf0[9] - bf0[22];
+ bf1[23] = bf0[8] - bf0[23];
+ bf1[24] = bf0[7] - bf0[24];
+ bf1[25] = bf0[6] - bf0[25];
+ bf1[26] = bf0[5] - bf0[26];
+ bf1[27] = bf0[4] - bf0[27];
+ bf1[28] = bf0[3] - bf0[28];
+ bf1[29] = bf0[2] - bf0[29];
+ bf1[30] = bf0[1] - bf0[30];
+ bf1[31] = bf0[0] - bf0[31];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = bf0[37];
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
+ bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
+ bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
+ bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
+ bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
+ bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
+ bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
+ bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
+ bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
+ bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
+ bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
+ bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
+ bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
+ bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
+ bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
+ bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = bf0[58];
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 11
+ stage++;
+ cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[63];
+ bf1[1] = bf0[1] + bf0[62];
+ bf1[2] = bf0[2] + bf0[61];
+ bf1[3] = bf0[3] + bf0[60];
+ bf1[4] = bf0[4] + bf0[59];
+ bf1[5] = bf0[5] + bf0[58];
+ bf1[6] = bf0[6] + bf0[57];
+ bf1[7] = bf0[7] + bf0[56];
+ bf1[8] = bf0[8] + bf0[55];
+ bf1[9] = bf0[9] + bf0[54];
+ bf1[10] = bf0[10] + bf0[53];
+ bf1[11] = bf0[11] + bf0[52];
+ bf1[12] = bf0[12] + bf0[51];
+ bf1[13] = bf0[13] + bf0[50];
+ bf1[14] = bf0[14] + bf0[49];
+ bf1[15] = bf0[15] + bf0[48];
+ bf1[16] = bf0[16] + bf0[47];
+ bf1[17] = bf0[17] + bf0[46];
+ bf1[18] = bf0[18] + bf0[45];
+ bf1[19] = bf0[19] + bf0[44];
+ bf1[20] = bf0[20] + bf0[43];
+ bf1[21] = bf0[21] + bf0[42];
+ bf1[22] = bf0[22] + bf0[41];
+ bf1[23] = bf0[23] + bf0[40];
+ bf1[24] = bf0[24] + bf0[39];
+ bf1[25] = bf0[25] + bf0[38];
+ bf1[26] = bf0[26] + bf0[37];
+ bf1[27] = bf0[27] + bf0[36];
+ bf1[28] = bf0[28] + bf0[35];
+ bf1[29] = bf0[29] + bf0[34];
+ bf1[30] = bf0[30] + bf0[33];
+ bf1[31] = bf0[31] + bf0[32];
+ bf1[32] = bf0[31] - bf0[32];
+ bf1[33] = bf0[30] - bf0[33];
+ bf1[34] = bf0[29] - bf0[34];
+ bf1[35] = bf0[28] - bf0[35];
+ bf1[36] = bf0[27] - bf0[36];
+ bf1[37] = bf0[26] - bf0[37];
+ bf1[38] = bf0[25] - bf0[38];
+ bf1[39] = bf0[24] - bf0[39];
+ bf1[40] = bf0[23] - bf0[40];
+ bf1[41] = bf0[22] - bf0[41];
+ bf1[42] = bf0[21] - bf0[42];
+ bf1[43] = bf0[20] - bf0[43];
+ bf1[44] = bf0[19] - bf0[44];
+ bf1[45] = bf0[18] - bf0[45];
+ bf1[46] = bf0[17] - bf0[46];
+ bf1[47] = bf0[16] - bf0[47];
+ bf1[48] = bf0[15] - bf0[48];
+ bf1[49] = bf0[14] - bf0[49];
+ bf1[50] = bf0[13] - bf0[50];
+ bf1[51] = bf0[12] - bf0[51];
+ bf1[52] = bf0[11] - bf0[52];
+ bf1[53] = bf0[10] - bf0[53];
+ bf1[54] = bf0[9] - bf0[54];
+ bf1[55] = bf0[8] - bf0[55];
+ bf1[56] = bf0[7] - bf0[56];
+ bf1[57] = bf0[6] - bf0[57];
+ bf1[58] = bf0[5] - bf0[58];
+ bf1[59] = bf0[4] - bf0[59];
+ bf1[60] = bf0[3] - bf0[60];
+ bf1[61] = bf0[2] - bf0[61];
+ bf1[62] = bf0[1] - bf0[62];
+ bf1[63] = bf0[0] - bf0[63];
+ range_check(stage, input, bf1, size, stage_range[stage]);
+}
+#endif // CONFIG_TX64X64
diff --git a/av1/common/av1_inv_txfm1d.h b/av1/common/av1_inv_txfm1d.h
new file mode 100644
index 0000000..5937617
--- /dev/null
+++ b/av1/common/av1_inv_txfm1d.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AV1_INV_TXFM1D_H_
+#define AV1_INV_TXFM1D_H_
+
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
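+// Note (summary inferred from av1_inv_txfm1d.c and the *_cfg tables in
+// av1_inv_txfm2d_cfg.h): each routine below performs one N-point 1-D inverse
+// transform; `input` and `output` are length-N int32_t arrays, and `cos_bit`
+// and `stage_range` supply one entry per butterfly stage.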
+void av1_idct4_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+ const int8_t *stage_range);
+void av1_idct8_new(const int32_t *input, int32_t *output, const int8_t *cos_bit,
+ const int8_t *stage_range);
+void av1_idct16_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct32_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct64_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+void av1_iadst4_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst8_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst16_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst32_new(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AV1_INV_TXFM1D_H_
diff --git a/av1/common/av1_inv_txfm2d.c b/av1/common/av1_inv_txfm2d.c
new file mode 100644
index 0000000..844a38a
--- /dev/null
+++ b/av1/common/av1_inv_txfm2d.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./av1_rtcd.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm2d_cfg.h"
+
+static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) {
+ switch (txfm_type) {
+ case TXFM_TYPE_DCT4: return av1_idct4_new;
+ case TXFM_TYPE_DCT8: return av1_idct8_new;
+ case TXFM_TYPE_DCT16: return av1_idct16_new;
+ case TXFM_TYPE_DCT32: return av1_idct32_new;
+ case TXFM_TYPE_ADST4: return av1_iadst4_new;
+ case TXFM_TYPE_ADST8: return av1_iadst8_new;
+ case TXFM_TYPE_ADST16: return av1_iadst16_new;
+ case TXFM_TYPE_ADST32: return av1_iadst32_new;
+ default: assert(0); return NULL;
+ }
+}
+
+#if CONFIG_EXT_TX
+static const TXFM_2D_CFG *inv_txfm_cfg_ls[FLIPADST_ADST + 1][TX_SIZES] = {
+ { &inv_txfm_2d_cfg_dct_dct_4, &inv_txfm_2d_cfg_dct_dct_8,
+ &inv_txfm_2d_cfg_dct_dct_16, &inv_txfm_2d_cfg_dct_dct_32 },
+ { &inv_txfm_2d_cfg_adst_dct_4, &inv_txfm_2d_cfg_adst_dct_8,
+ &inv_txfm_2d_cfg_adst_dct_16, &inv_txfm_2d_cfg_adst_dct_32 },
+ { &inv_txfm_2d_cfg_dct_adst_4, &inv_txfm_2d_cfg_dct_adst_8,
+ &inv_txfm_2d_cfg_dct_adst_16, &inv_txfm_2d_cfg_dct_adst_32 },
+ { &inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+ &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32 },
+ { &inv_txfm_2d_cfg_adst_dct_4, &inv_txfm_2d_cfg_adst_dct_8,
+ &inv_txfm_2d_cfg_adst_dct_16, &inv_txfm_2d_cfg_adst_dct_32 },
+ { &inv_txfm_2d_cfg_dct_adst_4, &inv_txfm_2d_cfg_dct_adst_8,
+ &inv_txfm_2d_cfg_dct_adst_16, &inv_txfm_2d_cfg_dct_adst_32 },
+ { &inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+ &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32 },
+ { &inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+ &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32 },
+ { &inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+ &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32 },
+};
+#else
+static const TXFM_2D_CFG *inv_txfm_cfg_ls[TX_TYPES][TX_SIZES] = {
+ { &inv_txfm_2d_cfg_dct_dct_4, &inv_txfm_2d_cfg_dct_dct_8,
+ &inv_txfm_2d_cfg_dct_dct_16, &inv_txfm_2d_cfg_dct_dct_32 },
+ { &inv_txfm_2d_cfg_adst_dct_4, &inv_txfm_2d_cfg_adst_dct_8,
+ &inv_txfm_2d_cfg_adst_dct_16, &inv_txfm_2d_cfg_adst_dct_32 },
+ { &inv_txfm_2d_cfg_dct_adst_4, &inv_txfm_2d_cfg_dct_adst_8,
+ &inv_txfm_2d_cfg_dct_adst_16, &inv_txfm_2d_cfg_dct_adst_32 },
+ { &inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+ &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32 },
+};
+#endif
+
+TXFM_2D_FLIP_CFG av1_get_inv_txfm_cfg(int tx_type, int tx_size) {
+ TXFM_2D_FLIP_CFG cfg;
+ set_flip_cfg(tx_type, &cfg);
+ cfg.cfg = inv_txfm_cfg_ls[tx_type][tx_size];
+ return cfg;
+}
+
+TXFM_2D_FLIP_CFG av1_get_inv_txfm_64x64_cfg(int tx_type) {
+ TXFM_2D_FLIP_CFG cfg = { 0, 0, NULL };
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg.cfg = &inv_txfm_2d_cfg_dct_dct_64;
+ set_flip_cfg(tx_type, &cfg);
+ break;
+ default: assert(0);
+ }
+ return cfg;
+}
+
+static INLINE void inv_txfm2d_add_c(const int32_t *input, int16_t *output,
+ int stride, TXFM_2D_FLIP_CFG *cfg,
+ int32_t *txfm_buf) {
+ const int txfm_size = cfg->cfg->txfm_size;
+ const int8_t *shift = cfg->cfg->shift;
+ const int8_t *stage_range_col = cfg->cfg->stage_range_col;
+ const int8_t *stage_range_row = cfg->cfg->stage_range_row;
+ const int8_t *cos_bit_col = cfg->cfg->cos_bit_col;
+ const int8_t *cos_bit_row = cfg->cfg->cos_bit_row;
+ const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->cfg->txfm_type_col);
+ const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->cfg->txfm_type_row);
+
+  // txfm_buf's length is txfm_size * txfm_size + 2 * txfm_size;
+  // it is used to buffer intermediate data.
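+  // e.g. with txfm_size = 4, the pointers set up below partition it as:
+  //   txfm_buf[ 0 ..  3]  temp_in   (one column gathered from buf)
+  //   txfm_buf[ 4 ..  7]  temp_out  (that column after the column transform)
+  //   txfm_buf[ 8 .. 23]  buf       (row-transform results, row-major)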
+ int32_t *temp_in = txfm_buf;
+ int32_t *temp_out = temp_in + txfm_size;
+ int32_t *buf = temp_out + txfm_size;
+ int32_t *buf_ptr = buf;
+ int c, r;
+
+ // Rows
+ for (r = 0; r < txfm_size; ++r) {
+ txfm_func_row(input, buf_ptr, cos_bit_row, stage_range_row);
+ round_shift_array(buf_ptr, txfm_size, -shift[0]);
+ input += txfm_size;
+ buf_ptr += txfm_size;
+ }
+
+ // Columns
+ for (c = 0; c < txfm_size; ++c) {
+ if (cfg->lr_flip == 0) {
+ for (r = 0; r < txfm_size; ++r) temp_in[r] = buf[r * txfm_size + c];
+ } else {
+ // flip left right
+ for (r = 0; r < txfm_size; ++r)
+ temp_in[r] = buf[r * txfm_size + (txfm_size - c - 1)];
+ }
+ txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
+ round_shift_array(temp_out, txfm_size, -shift[1]);
+ if (cfg->ud_flip == 0) {
+ for (r = 0; r < txfm_size; ++r) output[r * stride + c] += temp_out[r];
+ } else {
+ // flip upside down
+ for (r = 0; r < txfm_size; ++r)
+ output[r * stride + c] += temp_out[txfm_size - r - 1];
+ }
+ }
+}
+
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output,
+ int stride, int tx_type, int bd) {
+ int txfm_buf[4 * 4 + 4 + 4];
+  // output contains the prediction signal, which is always non-negative and
+  // no larger than (1 << bd) - 1.
+  // Since bd < 16 - 1, the uint16_t* output buffer can safely be treated as
+  // an int16_t* buffer.
+ TXFM_2D_FLIP_CFG cfg = av1_get_inv_txfm_cfg(tx_type, TX_4X4);
+ inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
+ clamp_block((int16_t *)output, 4, stride, 0, (1 << bd) - 1);
+}
+
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output,
+ int stride, int tx_type, int bd) {
+ int txfm_buf[8 * 8 + 8 + 8];
+  // output contains the prediction signal, which is always non-negative and
+  // no larger than (1 << bd) - 1.
+  // Since bd < 16 - 1, the uint16_t* output buffer can safely be treated as
+  // an int16_t* buffer.
+ TXFM_2D_FLIP_CFG cfg = av1_get_inv_txfm_cfg(tx_type, TX_8X8);
+ inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
+ clamp_block((int16_t *)output, 8, stride, 0, (1 << bd) - 1);
+}
+
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output,
+ int stride, int tx_type, int bd) {
+ int txfm_buf[16 * 16 + 16 + 16];
+  // output contains the prediction signal, which is always non-negative and
+  // no larger than (1 << bd) - 1.
+  // Since bd < 16 - 1, the uint16_t* output buffer can safely be treated as
+  // an int16_t* buffer.
+ TXFM_2D_FLIP_CFG cfg = av1_get_inv_txfm_cfg(tx_type, TX_16X16);
+ inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
+ clamp_block((int16_t *)output, 16, stride, 0, (1 << bd) - 1);
+}
+
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output,
+ int stride, int tx_type, int bd) {
+ int txfm_buf[32 * 32 + 32 + 32];
+  // output contains the prediction signal, which is always non-negative and
+  // no larger than (1 << bd) - 1.
+  // Since bd < 16 - 1, the uint16_t* output buffer can safely be treated as
+  // an int16_t* buffer.
+ TXFM_2D_FLIP_CFG cfg = av1_get_inv_txfm_cfg(tx_type, TX_32X32);
+ inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
+ clamp_block((int16_t *)output, 32, stride, 0, (1 << bd) - 1);
+}
+
+void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output,
+ int stride, int tx_type, int bd) {
+ int txfm_buf[64 * 64 + 64 + 64];
+  // output contains the prediction signal, which is always non-negative and
+  // no larger than (1 << bd) - 1.
+  // Since bd < 16 - 1, the uint16_t* output buffer can safely be treated as
+  // an int16_t* buffer.
+ TXFM_2D_FLIP_CFG cfg = av1_get_inv_txfm_64x64_cfg(tx_type);
+ inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
+ clamp_block((int16_t *)output, 64, stride, 0, (1 << bd) - 1);
+}
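+
+// Usage sketch (illustrative only; `coeffs`, `dst` and `dst_stride` are
+// hypothetical caller-side names). A caller accumulates the inverse transform
+// of a dequantized 4x4 block into a high-bitdepth prediction buffer with:
+//   int32_t coeffs[4 * 4];      // dequantized transform coefficients
+//   uint16_t *dst;              // prediction/reconstruction buffer
+//   int dst_stride;             // in uint16_t units
+//   av1_inv_txfm2d_add_4x4_c(coeffs, dst, dst_stride, DCT_DCT, 10 /* bd */);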
diff --git a/av1/common/av1_inv_txfm2d_cfg.h b/av1/common/av1_inv_txfm2d_cfg.h
new file mode 100644
index 0000000..ee018fb
--- /dev/null
+++ b/av1/common/av1_inv_txfm2d_cfg.h
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AV1_INV_TXFM2D_CFG_H_
+#define AV1_INV_TXFM2D_CFG_H_
+#include "av1/common/av1_inv_txfm1d.h"
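+
+// How these tables are consumed (see inv_txfm2d_add_c() in
+// av1/common/av1_inv_txfm2d.c): shift[0] and shift[1] are negated and passed
+// to round_shift_array() after the row and column passes respectively, while
+// the stage_range_* and cos_bit_* arrays (one entry per stage) are forwarded
+// to the 1-D transform functions declared in av1_inv_txfm1d.h.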
+// ---------------- config inv_dct_dct_4 ----------------
+static const int8_t inv_shift_dct_dct_4[2] = { 0, -4 };
+static const int8_t inv_stage_range_col_dct_dct_4[4] = { 18, 18, 17, 17 };
+static const int8_t inv_stage_range_row_dct_dct_4[4] = { 18, 18, 18, 18 };
+static const int8_t inv_cos_bit_col_dct_dct_4[4] = { 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_dct_4[4] = { 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_4 = {
+ 4, // .txfm_size
+ 4, // .stage_num_col
+ 4, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_dct_dct_4, // .shift
+ inv_stage_range_col_dct_dct_4, // .stage_range_col
+ inv_stage_range_row_dct_dct_4, // .stage_range_row
+ inv_cos_bit_col_dct_dct_4, // .cos_bit_col
+ inv_cos_bit_row_dct_dct_4, // .cos_bit_row
+ TXFM_TYPE_DCT4, // .txfm_type_col
+  TXFM_TYPE_DCT4                  // .txfm_type_row
+};
+
+// ---------------- config inv_dct_dct_8 ----------------
+static const int8_t inv_shift_dct_dct_8[2] = { 0, -5 };
+static const int8_t inv_stage_range_col_dct_dct_8[6] = {
+ 19, 19, 19, 19, 18, 18
+};
+static const int8_t inv_stage_range_row_dct_dct_8[6] = {
+ 19, 19, 19, 19, 19, 19
+};
+static const int8_t inv_cos_bit_col_dct_dct_8[6] = { 13, 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_dct_8[6] = { 13, 13, 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_8 = {
+ 8, // .txfm_size
+ 6, // .stage_num_col
+ 6, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_dct_dct_8, // .shift
+ inv_stage_range_col_dct_dct_8, // .stage_range_col
+ inv_stage_range_row_dct_dct_8, // .stage_range_row
+ inv_cos_bit_col_dct_dct_8, // .cos_bit_col
+ inv_cos_bit_row_dct_dct_8, // .cos_bit_row
+ TXFM_TYPE_DCT8, // .txfm_type_col
+  TXFM_TYPE_DCT8                  // .txfm_type_row
+};
+
+// ---------------- config inv_dct_dct_16 ----------------
+static const int8_t inv_shift_dct_dct_16[2] = { -1, -5 };
+static const int8_t inv_stage_range_col_dct_dct_16[8] = { 19, 19, 19, 19,
+ 19, 19, 18, 18 };
+static const int8_t inv_stage_range_row_dct_dct_16[8] = { 20, 20, 20, 20,
+ 20, 20, 20, 20 };
+static const int8_t inv_cos_bit_col_dct_dct_16[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_dct_16[8] = { 12, 12, 12, 12,
+ 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_16 = {
+ 16, // .txfm_size
+ 8, // .stage_num_col
+ 8, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_dct_dct_16, // .shift
+ inv_stage_range_col_dct_dct_16, // .stage_range_col
+ inv_stage_range_row_dct_dct_16, // .stage_range_row
+ inv_cos_bit_col_dct_dct_16, // .cos_bit_col
+ inv_cos_bit_row_dct_dct_16, // .cos_bit_row
+ TXFM_TYPE_DCT16, // .txfm_type_col
+  TXFM_TYPE_DCT16                 // .txfm_type_row
+};
+
+// ---------------- config inv_dct_dct_32 ----------------
+static const int8_t inv_shift_dct_dct_32[2] = { -1, -5 };
+static const int8_t inv_stage_range_col_dct_dct_32[10] = { 19, 19, 19, 19, 19,
+ 19, 19, 19, 18, 18 };
+static const int8_t inv_stage_range_row_dct_dct_32[10] = { 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20 };
+static const int8_t inv_cos_bit_col_dct_dct_32[10] = { 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_dct_32[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_32 = {
+ 32, // .txfm_size
+ 10, // .stage_num_col
+ 10, // .stage_num_row
+ // 1, // .log_scale
+ inv_shift_dct_dct_32, // .shift
+ inv_stage_range_col_dct_dct_32, // .stage_range_col
+ inv_stage_range_row_dct_dct_32, // .stage_range_row
+ inv_cos_bit_col_dct_dct_32, // .cos_bit_col
+ inv_cos_bit_row_dct_dct_32, // .cos_bit_row
+ TXFM_TYPE_DCT32, // .txfm_type_col
+  TXFM_TYPE_DCT32                 // .txfm_type_row
+};
+
+// ---------------- config inv_dct_dct_64 ----------------
+static const int8_t inv_shift_dct_dct_64[2] = { -1, -7 };
+static const int8_t inv_stage_range_col_dct_dct_64[12] = {
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18
+};
+static const int8_t inv_stage_range_row_dct_dct_64[12] = {
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+};
+static const int8_t inv_cos_bit_col_dct_dct_64[12] = { 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_dct_64[12] = { 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_64 = {
+ 64, // .txfm_size
+ 12, // .stage_num_col
+ 12, // .stage_num_row
+ inv_shift_dct_dct_64, // .shift
+ inv_stage_range_col_dct_dct_64, // .stage_range_col
+ inv_stage_range_row_dct_dct_64, // .stage_range_row
+ inv_cos_bit_col_dct_dct_64, // .cos_bit_col
+ inv_cos_bit_row_dct_dct_64, // .cos_bit_row
+ TXFM_TYPE_DCT64, // .txfm_type_col
+  TXFM_TYPE_DCT64                 // .txfm_type_row
+};
+
+// ---------------- config inv_dct_adst_4 ----------------
+static const int8_t inv_shift_dct_adst_4[2] = { 0, -4 };
+static const int8_t inv_stage_range_col_dct_adst_4[4] = { 18, 18, 17, 17 };
+static const int8_t inv_stage_range_row_dct_adst_4[6] = {
+ 18, 18, 18, 18, 18, 18
+};
+static const int8_t inv_cos_bit_col_dct_adst_4[4] = { 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_adst_4[6] = { 13, 13, 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_4 = {
+ 4, // .txfm_size
+ 4, // .stage_num_col
+ 6, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_dct_adst_4, // .shift
+ inv_stage_range_col_dct_adst_4, // .stage_range_col
+ inv_stage_range_row_dct_adst_4, // .stage_range_row
+ inv_cos_bit_col_dct_adst_4, // .cos_bit_col
+ inv_cos_bit_row_dct_adst_4, // .cos_bit_row
+ TXFM_TYPE_DCT4, // .txfm_type_col
+  TXFM_TYPE_ADST4                 // .txfm_type_row
+};
+
+// ---------------- config inv_dct_adst_8 ----------------
+static const int8_t inv_shift_dct_adst_8[2] = { 0, -5 };
+static const int8_t inv_stage_range_col_dct_adst_8[6] = {
+ 19, 19, 19, 19, 18, 18
+};
+static const int8_t inv_stage_range_row_dct_adst_8[8] = { 19, 19, 19, 19,
+ 19, 19, 19, 19 };
+static const int8_t inv_cos_bit_col_dct_adst_8[6] = { 13, 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_adst_8[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_8 = {
+ 8, // .txfm_size
+ 6, // .stage_num_col
+ 8, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_dct_adst_8, // .shift
+ inv_stage_range_col_dct_adst_8, // .stage_range_col
+ inv_stage_range_row_dct_adst_8, // .stage_range_row
+ inv_cos_bit_col_dct_adst_8, // .cos_bit_col
+ inv_cos_bit_row_dct_adst_8, // .cos_bit_row
+ TXFM_TYPE_DCT8, // .txfm_type_col
+  TXFM_TYPE_ADST8                 // .txfm_type_row
+};
+
+// ---------------- config inv_dct_adst_16 ----------------
+static const int8_t inv_shift_dct_adst_16[2] = { -1, -5 };
+static const int8_t inv_stage_range_col_dct_adst_16[8] = { 19, 19, 19, 19,
+ 19, 19, 18, 18 };
+static const int8_t inv_stage_range_row_dct_adst_16[10] = {
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+};
+static const int8_t inv_cos_bit_col_dct_adst_16[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_adst_16[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_16 = {
+ 16, // .txfm_size
+ 8, // .stage_num_col
+ 10, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_dct_adst_16, // .shift
+ inv_stage_range_col_dct_adst_16, // .stage_range_col
+ inv_stage_range_row_dct_adst_16, // .stage_range_row
+ inv_cos_bit_col_dct_adst_16, // .cos_bit_col
+ inv_cos_bit_row_dct_adst_16, // .cos_bit_row
+ TXFM_TYPE_DCT16, // .txfm_type_col
+  TXFM_TYPE_ADST16                // .txfm_type_row
+};
+
+// ---------------- config inv_dct_adst_32 ----------------
+static const int8_t inv_shift_dct_adst_32[2] = { -1, -5 };
+static const int8_t inv_stage_range_col_dct_adst_32[10] = {
+ 19, 19, 19, 19, 19, 19, 19, 19, 18, 18
+};
+static const int8_t inv_stage_range_row_dct_adst_32[12] = {
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+};
+static const int8_t inv_cos_bit_col_dct_adst_32[10] = { 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_dct_adst_32[12] = {
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_32 = {
+ 32, // .txfm_size
+ 10, // .stage_num_col
+ 12, // .stage_num_row
+ // 1, // .log_scale
+ inv_shift_dct_adst_32, // .shift
+ inv_stage_range_col_dct_adst_32, // .stage_range_col
+ inv_stage_range_row_dct_adst_32, // .stage_range_row
+ inv_cos_bit_col_dct_adst_32, // .cos_bit_col
+ inv_cos_bit_row_dct_adst_32, // .cos_bit_row
+ TXFM_TYPE_DCT32, // .txfm_type_col
+  TXFM_TYPE_ADST32                // .txfm_type_row
+};
+
+// ---------------- config inv_adst_adst_4 ----------------
+static const int8_t inv_shift_adst_adst_4[2] = { 0, -4 };
+static const int8_t inv_stage_range_col_adst_adst_4[6] = { 18, 18, 18,
+ 18, 17, 17 };
+static const int8_t inv_stage_range_row_adst_adst_4[6] = { 18, 18, 18,
+ 18, 18, 18 };
+static const int8_t inv_cos_bit_col_adst_adst_4[6] = { 13, 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_adst_adst_4[6] = { 13, 13, 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_4 = {
+ 4, // .txfm_size
+ 6, // .stage_num_col
+ 6, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_adst_adst_4, // .shift
+ inv_stage_range_col_adst_adst_4, // .stage_range_col
+ inv_stage_range_row_adst_adst_4, // .stage_range_row
+ inv_cos_bit_col_adst_adst_4, // .cos_bit_col
+ inv_cos_bit_row_adst_adst_4, // .cos_bit_row
+ TXFM_TYPE_ADST4, // .txfm_type_col
+  TXFM_TYPE_ADST4                 // .txfm_type_row
+};
+
+// ---------------- config inv_adst_adst_8 ----------------
+static const int8_t inv_shift_adst_adst_8[2] = { 0, -5 };
+static const int8_t inv_stage_range_col_adst_adst_8[8] = { 19, 19, 19, 19,
+ 19, 19, 18, 18 };
+static const int8_t inv_stage_range_row_adst_adst_8[8] = { 19, 19, 19, 19,
+ 19, 19, 19, 19 };
+static const int8_t inv_cos_bit_col_adst_adst_8[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_adst_adst_8[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_8 = {
+ 8, // .txfm_size
+ 8, // .stage_num_col
+ 8, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_adst_adst_8, // .shift
+ inv_stage_range_col_adst_adst_8, // .stage_range_col
+ inv_stage_range_row_adst_adst_8, // .stage_range_row
+ inv_cos_bit_col_adst_adst_8, // .cos_bit_col
+ inv_cos_bit_row_adst_adst_8, // .cos_bit_row
+ TXFM_TYPE_ADST8, // .txfm_type_col
+  TXFM_TYPE_ADST8                 // .txfm_type_row
+};
+
+// ---------------- config inv_adst_adst_16 ----------------
+static const int8_t inv_shift_adst_adst_16[2] = { -1, -5 };
+static const int8_t inv_stage_range_col_adst_adst_16[10] = {
+ 19, 19, 19, 19, 19, 19, 19, 19, 18, 18
+};
+static const int8_t inv_stage_range_row_adst_adst_16[10] = {
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+};
+static const int8_t inv_cos_bit_col_adst_adst_16[10] = { 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_adst_adst_16[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_16 = {
+ 16, // .txfm_size
+ 10, // .stage_num_col
+ 10, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_adst_adst_16, // .shift
+ inv_stage_range_col_adst_adst_16, // .stage_range_col
+ inv_stage_range_row_adst_adst_16, // .stage_range_row
+ inv_cos_bit_col_adst_adst_16, // .cos_bit_col
+ inv_cos_bit_row_adst_adst_16, // .cos_bit_row
+ TXFM_TYPE_ADST16, // .txfm_type_col
+  TXFM_TYPE_ADST16                // .txfm_type_row
+};
+
+// ---------------- config inv_adst_adst_32 ----------------
+static const int8_t inv_shift_adst_adst_32[2] = { -1, -5 };
+static const int8_t inv_stage_range_col_adst_adst_32[12] = {
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18
+};
+static const int8_t inv_stage_range_row_adst_adst_32[12] = {
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+};
+static const int8_t inv_cos_bit_col_adst_adst_32[12] = {
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13
+};
+static const int8_t inv_cos_bit_row_adst_adst_32[12] = {
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12
+};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_32 = {
+ 32, // .txfm_size
+ 12, // .stage_num_col
+ 12, // .stage_num_row
+ // 1, // .log_scale
+ inv_shift_adst_adst_32, // .shift
+ inv_stage_range_col_adst_adst_32, // .stage_range_col
+ inv_stage_range_row_adst_adst_32, // .stage_range_row
+ inv_cos_bit_col_adst_adst_32, // .cos_bit_col
+ inv_cos_bit_row_adst_adst_32, // .cos_bit_row
+ TXFM_TYPE_ADST32, // .txfm_type_col
+  TXFM_TYPE_ADST32                // .txfm_type_row
+};
+
+// ---------------- config inv_adst_dct_4 ----------------
+static const int8_t inv_shift_adst_dct_4[2] = { 0, -4 };
+static const int8_t inv_stage_range_col_adst_dct_4[6] = {
+ 18, 18, 18, 18, 17, 17
+};
+static const int8_t inv_stage_range_row_adst_dct_4[4] = { 18, 18, 18, 18 };
+static const int8_t inv_cos_bit_col_adst_dct_4[6] = { 13, 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_adst_dct_4[4] = { 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_4 = {
+ 4, // .txfm_size
+ 6, // .stage_num_col
+ 4, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_adst_dct_4, // .shift
+ inv_stage_range_col_adst_dct_4, // .stage_range_col
+ inv_stage_range_row_adst_dct_4, // .stage_range_row
+ inv_cos_bit_col_adst_dct_4, // .cos_bit_col
+ inv_cos_bit_row_adst_dct_4, // .cos_bit_row
+ TXFM_TYPE_ADST4, // .txfm_type_col
+  TXFM_TYPE_DCT4                  // .txfm_type_row
+};
+
+// ---------------- config inv_adst_dct_8 ----------------
+static const int8_t inv_shift_adst_dct_8[2] = { 0, -5 };
+static const int8_t inv_stage_range_col_adst_dct_8[8] = { 19, 19, 19, 19,
+ 19, 19, 18, 18 };
+static const int8_t inv_stage_range_row_adst_dct_8[6] = {
+ 19, 19, 19, 19, 19, 19
+};
+static const int8_t inv_cos_bit_col_adst_dct_8[8] = { 13, 13, 13, 13,
+ 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_adst_dct_8[6] = { 13, 13, 13, 13, 13, 13 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_8 = {
+ 8, // .txfm_size
+ 8, // .stage_num_col
+ 6, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_adst_dct_8, // .shift
+ inv_stage_range_col_adst_dct_8, // .stage_range_col
+ inv_stage_range_row_adst_dct_8, // .stage_range_row
+ inv_cos_bit_col_adst_dct_8, // .cos_bit_col
+ inv_cos_bit_row_adst_dct_8, // .cos_bit_row
+ TXFM_TYPE_ADST8, // .txfm_type_col
+  TXFM_TYPE_DCT8                  // .txfm_type_row
+};
+
+// ---------------- config inv_adst_dct_16 ----------------
+static const int8_t inv_shift_adst_dct_16[2] = { -1, -5 };
+static const int8_t inv_stage_range_col_adst_dct_16[10] = {
+ 19, 19, 19, 19, 19, 19, 19, 19, 18, 18
+};
+static const int8_t inv_stage_range_row_adst_dct_16[8] = { 20, 20, 20, 20,
+ 20, 20, 20, 20 };
+static const int8_t inv_cos_bit_col_adst_dct_16[10] = { 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13 };
+static const int8_t inv_cos_bit_row_adst_dct_16[8] = { 12, 12, 12, 12,
+ 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_16 = {
+ 16, // .txfm_size
+ 10, // .stage_num_col
+ 8, // .stage_num_row
+ // 0, // .log_scale
+ inv_shift_adst_dct_16, // .shift
+ inv_stage_range_col_adst_dct_16, // .stage_range_col
+ inv_stage_range_row_adst_dct_16, // .stage_range_row
+ inv_cos_bit_col_adst_dct_16, // .cos_bit_col
+ inv_cos_bit_row_adst_dct_16, // .cos_bit_row
+ TXFM_TYPE_ADST16, // .txfm_type_col
+  TXFM_TYPE_DCT16                 // .txfm_type_row
+};
+
+// ---------------- config inv_adst_dct_32 ----------------
+static const int8_t inv_shift_adst_dct_32[2] = { -1, -5 };
+static const int8_t inv_stage_range_col_adst_dct_32[12] = {
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18
+};
+static const int8_t inv_stage_range_row_adst_dct_32[10] = {
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
+};
+static const int8_t inv_cos_bit_col_adst_dct_32[12] = {
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13
+};
+static const int8_t inv_cos_bit_row_adst_dct_32[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_32 = {
+ 32, // .txfm_size
+ 12, // .stage_num_col
+ 10, // .stage_num_row
+ // 1, // .log_scale
+ inv_shift_adst_dct_32, // .shift
+ inv_stage_range_col_adst_dct_32, // .stage_range_col
+ inv_stage_range_row_adst_dct_32, // .stage_range_row
+ inv_cos_bit_col_adst_dct_32, // .cos_bit_col
+ inv_cos_bit_row_adst_dct_32, // .cos_bit_row
+ TXFM_TYPE_ADST32, // .txfm_type_col
+  TXFM_TYPE_DCT32                 // .txfm_type_row
+};
+
+#endif // AV1_INV_TXFM2D_CFG_H_
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 6b7623b..f96dcf2 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -7,6 +7,9 @@
#include "aom/aom_integer.h"
#include "av1/common/common.h"
#include "av1/common/enums.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/filter.h"
+#include "av1/common/av1_txfm.h"
struct macroblockd;
@@ -17,7 +20,6 @@
struct mv;
union int_mv;
struct yv12_buffer_config;
-struct InterpFilterParams;
typedef int16_t od_dering_in;
EOF
}
@@ -33,21 +35,24 @@
$avx2_x86_64 = 'avx2';
}
-add_proto qw/void av1_convolve_horiz/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const struct InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg";
+#
+# 10/12-tap convolution filters
+#
+add_proto qw/void av1_convolve_horiz/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg";
specialize qw/av1_convolve_horiz ssse3/;
-add_proto qw/void av1_convolve_vert/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const struct InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg";
+add_proto qw/void av1_convolve_vert/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg";
specialize qw/av1_convolve_vert ssse3/;
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
- add_proto qw/void av1_highbd_convolve_horiz/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const struct InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
+ add_proto qw/void av1_highbd_convolve_horiz/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
specialize qw/av1_highbd_convolve_horiz sse4_1/;
- add_proto qw/void av1_highbd_convolve_vert/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const struct InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
+ add_proto qw/void av1_highbd_convolve_vert/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
specialize qw/av1_highbd_convolve_vert sse4_1/;
}
#
-# dct
+# Inverse dct
#
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
# Note as optimized versions of these functions are added we need to add a check to ensure
@@ -56,6 +61,24 @@
add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x4_16_add/;
+ add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht4x8_32_add/;
+
+ add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht8x4_32_add/;
+
+ add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht8x16_128_add/;
+
+ add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht16x8_128_add/;
+
+ add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht16x32_512_add/;
+
+ add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht32x16_512_add/;
+
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x8_64_add/;
@@ -65,11 +88,32 @@
add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x4_16_add sse2/;
+ add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht4x8_32_add sse2/;
+
+ add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht8x4_32_add sse2/;
+
+ add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht8x16_128_add sse2/;
+
+ add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht16x8_128_add sse2/;
+
+ add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht16x32_512_add sse2/;
+
+ add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht32x16_512_add sse2/;
+
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x8_64_add sse2/;
add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
- specialize qw/av1_iht16x16_256_add/;
+ specialize qw/av1_iht16x16_256_add sse2 avx2/;
+
+ add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+ specialize qw/av1_iht32x32_1024_add/;
}
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
@@ -77,20 +121,137 @@
add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht4x4_16_add/;
+ add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht4x8_32_add/;
+
+ add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht8x4_32_add/;
+
+ add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht8x16_128_add/;
+
+ add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht16x8_128_add/;
+
+ add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht16x32_512_add/;
+
+ add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht32x16_512_add/;
+
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
specialize qw/av1_iht8x8_64_add/;
add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
specialize qw/av1_iht16x16_256_add/;
+
+ add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+ specialize qw/av1_iht32x32_1024_add/;
+
} else {
add_proto qw/void av1_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
- specialize qw/av1_iht4x4_16_add sse2 neon dspr2 msa/;
+ specialize qw/av1_iht4x4_16_add sse2 neon dspr2/;
+
+ add_proto qw/void av1_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht4x8_32_add sse2/;
+
+ add_proto qw/void av1_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht8x4_32_add sse2/;
+
+ add_proto qw/void av1_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht8x16_128_add sse2/;
+
+ add_proto qw/void av1_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht16x8_128_add sse2/;
+
+ add_proto qw/void av1_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht16x32_512_add sse2/;
+
+ add_proto qw/void av1_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/av1_iht32x16_512_add sse2/;
add_proto qw/void av1_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
- specialize qw/av1_iht8x8_64_add sse2 neon dspr2 msa/;
+ specialize qw/av1_iht8x8_64_add sse2 neon dspr2/;
add_proto qw/void av1_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
- specialize qw/av1_iht16x16_256_add sse2 dspr2 msa/;
+ specialize qw/av1_iht16x16_256_add sse2 avx2 dspr2/;
+
+ add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+ specialize qw/av1_iht32x32_1024_add/;
+
+ if (aom_config("CONFIG_EXT_TX") ne "yes") {
+ specialize qw/av1_iht4x4_16_add msa/;
+ specialize qw/av1_iht8x8_64_add msa/;
+ specialize qw/av1_iht16x16_256_add msa/;
+ }
+ }
+}
+add_proto qw/void av1_iht32x32_1024_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+specialize qw/av1_iht32x32_1024_add/;
+
+if (aom_config("CONFIG_TX64X64") eq "yes") {
+ add_proto qw/void av1_iht64x64_4096_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+ specialize qw/av1_iht64x64_4096_add/;
+}
+
+if (aom_config("CONFIG_NEW_QUANT") eq "yes") {
+ add_proto qw/void quantize_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+ specialize qw/quantize_nuq/;
+
+ add_proto qw/void quantize_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+ specialize qw/quantize_fp_nuq/;
+
+ add_proto qw/void quantize_32x32_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+ specialize qw/quantize_32x32_nuq/;
+
+ add_proto qw/void quantize_32x32_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+ specialize qw/quantize_32x32_fp_nuq/;
+}
+
+# FILTER_INTRA predictor functions
+if (aom_config("CONFIG_FILTER_INTRA") eq "yes") {
+ add_proto qw/void av1_dc_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/av1_dc_filter_predictor sse4_1/;
+ add_proto qw/void av1_v_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/av1_v_filter_predictor sse4_1/;
+ add_proto qw/void av1_h_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/av1_h_filter_predictor sse4_1/;
+ add_proto qw/void av1_d45_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/av1_d45_filter_predictor sse4_1/;
+ add_proto qw/void av1_d135_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/av1_d135_filter_predictor sse4_1/;
+ add_proto qw/void av1_d117_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/av1_d117_filter_predictor sse4_1/;
+ add_proto qw/void av1_d153_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/av1_d153_filter_predictor sse4_1/;
+ add_proto qw/void av1_d207_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/av1_d207_filter_predictor sse4_1/;
+ add_proto qw/void av1_d63_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/av1_d63_filter_predictor sse4_1/;
+ add_proto qw/void av1_tm_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/av1_tm_filter_predictor sse4_1/;
+ # High bitdepth functions
+ if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void av1_highbd_dc_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/av1_highbd_dc_filter_predictor sse4_1/;
+ add_proto qw/void av1_highbd_v_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/av1_highbd_v_filter_predictor sse4_1/;
+ add_proto qw/void av1_highbd_h_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/av1_highbd_h_filter_predictor sse4_1/;
+ add_proto qw/void av1_highbd_d45_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/av1_highbd_d45_filter_predictor sse4_1/;
+ add_proto qw/void av1_highbd_d135_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/av1_highbd_d135_filter_predictor sse4_1/;
+ add_proto qw/void av1_highbd_d117_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/av1_highbd_d117_filter_predictor sse4_1/;
+ add_proto qw/void av1_highbd_d153_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/av1_highbd_d153_filter_predictor sse4_1/;
+ add_proto qw/void av1_highbd_d207_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/av1_highbd_d207_filter_predictor sse4_1/;
+ add_proto qw/void av1_highbd_d63_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/av1_highbd_d63_filter_predictor sse4_1/;
+ add_proto qw/void av1_highbd_tm_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd";
+ specialize qw/av1_highbd_tm_filter_predictor sse4_1/;
}
}
@@ -131,6 +292,24 @@
add_proto qw/void av1_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht4x4_16_add/;
+ add_proto qw/void av1_highbd_iht4x8_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ specialize qw/av1_highbd_iht4x8_32_add/;
+
+ add_proto qw/void av1_highbd_iht8x4_32_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ specialize qw/av1_highbd_iht8x4_32_add/;
+
+ add_proto qw/void av1_highbd_iht8x16_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ specialize qw/av1_highbd_iht8x16_128_add/;
+
+ add_proto qw/void av1_highbd_iht16x8_128_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ specialize qw/av1_highbd_iht16x8_128_add/;
+
+ add_proto qw/void av1_highbd_iht16x32_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ specialize qw/av1_highbd_iht16x32_512_add/;
+
+ add_proto qw/void av1_highbd_iht32x16_512_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+ specialize qw/av1_highbd_iht32x16_512_add/;
+
add_proto qw/void av1_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
specialize qw/av1_highbd_iht8x8_64_add/;
@@ -160,10 +339,10 @@
specialize qw/av1_fdct8x8_quant/;
} else {
add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
- specialize qw/av1_block_error avx2 msa sse2/;
+ specialize qw/av1_block_error avx2 msa/, "$sse2_x86inc";
add_proto qw/int64_t av1_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
- specialize qw/av1_block_error_fp neon sse2/;
+ specialize qw/av1_block_error_fp neon/, "$sse2_x86inc";
add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t *iqm_ptr";
@@ -205,374 +384,82 @@
}
-
# fdct functions
-if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
- add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/av1_fht4x4 sse2/;
+add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht4x4 sse2/;
- add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/av1_fht8x8 sse2/;
+add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+specialize qw/av1_fwht4x4/;
- add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/av1_fht16x16 sse2/;
+add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht8x8 sse2/;
- add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fwht4x4 sse2/;
- if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct4x4/;
+add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht16x16 sse2 avx2/;
- add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct4x4_1/;
+add_proto qw/void av1_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht32x32 avx2/;
- add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct8x8/;
+if (aom_config("CONFIG_TX64X64") eq "yes") {
+ add_proto qw/void av1_fht64x64/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_fht64x64/;
+}
- add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct8x8_1/;
+add_proto qw/void av1_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht4x8 sse2/;
- add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct16x16/;
+add_proto qw/void av1_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht8x4 sse2/;
- add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct16x16_1/;
+add_proto qw/void av1_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht8x16 sse2/;
- add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32/;
+add_proto qw/void av1_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht16x8 sse2/;
- add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32_rd/;
+add_proto qw/void av1_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht16x32 sse2/;
- add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32_1/;
+add_proto qw/void av1_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+specialize qw/av1_fht32x16 sse2/;
- add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct4x4/;
-
- add_proto qw/void av1_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct8x8/;
-
- add_proto qw/void av1_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct8x8_1/;
-
- add_proto qw/void av1_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct16x16/;
-
- add_proto qw/void av1_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct16x16_1/;
-
- add_proto qw/void av1_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32/;
-
- add_proto qw/void av1_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32_rd/;
-
- add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32_1/;
- } else {
- add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct4x4 sse2/;
-
- add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct4x4_1 sse2/;
-
- add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct8x8 sse2/;
-
- add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct8x8_1 sse2/;
-
- add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct16x16 sse2/;
-
- add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct16x16_1 sse2/;
-
- add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32 sse2/;
-
- add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32_rd sse2/;
-
- add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32_1 sse2/;
-
- add_proto qw/void av1_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct4x4 sse2/;
-
- add_proto qw/void av1_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct8x8 sse2/;
-
- add_proto qw/void av1_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct8x8_1/;
-
- add_proto qw/void av1_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct16x16 sse2/;
-
- add_proto qw/void av1_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct16x16_1/;
-
- add_proto qw/void av1_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32 sse2/;
-
- add_proto qw/void av1_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32_rd sse2/;
-
- add_proto qw/void av1_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_highbd_fdct32x32_1/;
- }
-} else {
- add_proto qw/void av1_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/av1_fht4x4 sse2 msa/;
-
- add_proto qw/void av1_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/av1_fht8x8 sse2 msa/;
-
- add_proto qw/void av1_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/av1_fht16x16 sse2 msa/;
-
- add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fwht4x4 msa sse2/;
- if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct4x4/;
-
- add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct4x4_1/;
-
- add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct8x8/;
-
- add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct8x8_1/;
-
- add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct16x16/;
-
- add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct16x16_1/;
-
- add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32/;
-
- add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32_rd/;
-
- add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32_1/;
- } else {
- add_proto qw/void av1_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct4x4 sse2/;
-
- add_proto qw/void av1_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct4x4_1 sse2/;
-
- add_proto qw/void av1_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct8x8 sse2/;
-
- add_proto qw/void av1_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct8x8_1 sse2/;
-
- add_proto qw/void av1_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct16x16 sse2/;
-
- add_proto qw/void av1_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct16x16_1 sse2/;
-
- add_proto qw/void av1_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32 sse2/;
-
- add_proto qw/void av1_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32_rd sse2/;
-
- add_proto qw/void av1_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/av1_fdct32x32_1 sse2/;
+if (aom_config("CONFIG_AOM_HIGHBITDEPTH") ne "yes") {
+ if (aom_config("CONFIG_EXT_TX") ne "yes") {
+ specialize qw/av1_fht4x4 msa/;
+ specialize qw/av1_fht8x8 msa/;
+ specialize qw/av1_fht16x16 msa/;
}
}
-# Inverse transform
+add_proto qw/void av1_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type";
+specialize qw/av1_fwd_idtx/;
+
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
- # Note as optimized versions of these functions are added we need to add a check to ensure
- # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
- add_proto qw/void av1_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_1_add/;
+ #fwd txfm
+ add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_4x4 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_8x8 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_16x16 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_64x64 sse4_1/;
- add_proto qw/void av1_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_16_add/;
-
- add_proto qw/void av1_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_1_add/;
-
- add_proto qw/void av1_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_64_add/;
-
- add_proto qw/void av1_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_12_add/;
-
- add_proto qw/void av1_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_1_add/;
-
- add_proto qw/void av1_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_256_add/;
-
- add_proto qw/void av1_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_10_add/;
-
- add_proto qw/void av1_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1024_add/;
-
- add_proto qw/void av1_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_34_add/;
-
- add_proto qw/void av1_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1_add/;
-
- add_proto qw/void av1_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_1_add/;
-
- add_proto qw/void av1_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_16_add/;
-
- add_proto qw/void av1_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct4x4_1_add/;
-
- add_proto qw/void av1_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct8x8_1_add/;
-
- add_proto qw/void av1_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct16x16_1_add/;
-
- add_proto qw/void av1_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct32x32_1024_add/;
-
- add_proto qw/void av1_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct32x32_34_add/;
-
- add_proto qw/void av1_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct32x32_1_add/;
-
- add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_iwht4x4_1_add/;
-
- add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_iwht4x4_16_add/;
-
- # Force C versions if CONFIG_EMULATE_HARDWARE is 1
- if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void av1_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct4x4_16_add/;
-
- add_proto qw/void av1_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct8x8_64_add/;
-
- add_proto qw/void av1_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct8x8_10_add/;
-
- add_proto qw/void av1_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct16x16_256_add/;
-
- add_proto qw/void av1_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct16x16_10_add/;
- } else {
- add_proto qw/void av1_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct4x4_16_add sse2/;
-
- add_proto qw/void av1_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct8x8_64_add sse2/;
-
- add_proto qw/void av1_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct8x8_10_add sse2/;
-
- add_proto qw/void av1_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct16x16_256_add sse2/;
-
- add_proto qw/void av1_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/av1_highbd_idct16x16_10_add sse2/;
- } # CONFIG_EMULATE_HARDWARE
-} else {
- # Force C versions if CONFIG_EMULATE_HARDWARE is 1
- if (aom_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
- add_proto qw/void av1_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_1_add/;
-
- add_proto qw/void av1_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_16_add/;
-
- add_proto qw/void av1_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_1_add/;
-
- add_proto qw/void av1_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_64_add/;
-
- add_proto qw/void av1_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_12_add/;
-
- add_proto qw/void av1_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_1_add/;
-
- add_proto qw/void av1_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_256_add/;
-
- add_proto qw/void av1_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_10_add/;
-
- add_proto qw/void av1_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1024_add/;
-
- add_proto qw/void av1_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_34_add/;
-
- add_proto qw/void av1_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1_add/;
-
- add_proto qw/void av1_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_1_add/;
-
- add_proto qw/void av1_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_16_add/;
- } else {
- add_proto qw/void av1_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_1_add sse2/;
-
- add_proto qw/void av1_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct4x4_16_add sse2/;
-
- add_proto qw/void av1_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_1_add sse2/;
-
- add_proto qw/void av1_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_64_add sse2/;
-
- add_proto qw/void av1_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct8x8_12_add sse2/;
-
- add_proto qw/void av1_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_1_add sse2/;
-
- add_proto qw/void av1_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_256_add sse2/;
-
- add_proto qw/void av1_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct16x16_10_add sse2/;
-
- add_proto qw/void av1_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1024_add sse2/;
-
- add_proto qw/void av1_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_34_add sse2/;
-
- add_proto qw/void av1_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_idct32x32_1_add sse2/;
-
- add_proto qw/void av1_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_1_add/;
-
- add_proto qw/void av1_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/av1_iwht4x4_16_add/;
- } # CONFIG_EMULATE_HARDWARE
-} # CONFIG_AOM_HIGHBITDEPTH
+ #inv txfm
+ add_proto qw/void av1_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_inv_txfm2d_add_4x4 sse4_1/;
+ add_proto qw/void av1_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_inv_txfm2d_add_8x8 sse4_1/;
+ add_proto qw/void av1_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_inv_txfm2d_add_16x16 sse4_1/;
+ add_proto qw/void av1_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_inv_txfm2d_add_32x32/;
+ add_proto qw/void av1_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
+ specialize qw/av1_inv_txfm2d_add_64x64/;
+}
#
# Motion search
@@ -582,7 +469,7 @@
$av1_full_search_sad_sse3=av1_full_search_sadx3;
$av1_full_search_sad_sse4_1=av1_full_search_sadx8;
-add_proto qw/int av1_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv";
+add_proto qw/int av1_diamond_search_sad/, "struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv";
specialize qw/av1_diamond_search_sad/;
add_proto qw/int av1_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const struct mv *center_mv";
@@ -594,26 +481,61 @@
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
# ENCODEMB INVOKE
+ if (aom_config("CONFIG_NEW_QUANT") eq "yes") {
+ add_proto qw/void highbd_quantize_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+ specialize qw/highbd_quantize_nuq/;
+
+ add_proto qw/void highbd_quantize_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+ specialize qw/highbd_quantize_fp_nuq/;
+
+ add_proto qw/void highbd_quantize_32x32_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+ specialize qw/highbd_quantize_32x32_nuq/;
+
+ add_proto qw/void highbd_quantize_32x32_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+ specialize qw/highbd_quantize_32x32_fp_nuq/;
+ }
add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
specialize qw/av1_highbd_block_error sse2/;
if (aom_config("CONFIG_AOM_QM") eq "yes") {
- add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
-
- add_proto qw/void av1_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr";
- } else {
- add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
specialize qw/av1_highbd_quantize_fp/;
- add_proto qw/void av1_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void av1_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
specialize qw/av1_highbd_quantize_fp_32x32/;
+ add_proto qw/void av1_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+ specialize qw/av1_highbd_quantize_b/;
+ } else {
+ add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
+ specialize qw/av1_highbd_quantize_fp sse4_1/;
+
+ add_proto qw/void av1_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
+ specialize qw/av1_highbd_quantize_b/;
}
# fdct functions
add_proto qw/void av1_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/av1_highbd_fht4x4/;
+ specialize qw/av1_highbd_fht4x4 sse4_1/;
+
+ add_proto qw/void av1_highbd_fht4x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_highbd_fht4x8/;
+
+ add_proto qw/void av1_highbd_fht8x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_highbd_fht8x4/;
+
+ add_proto qw/void av1_highbd_fht8x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_highbd_fht8x16/;
+
+ add_proto qw/void av1_highbd_fht16x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_highbd_fht16x8/;
+
+ add_proto qw/void av1_highbd_fht16x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_highbd_fht16x32/;
+
+ add_proto qw/void av1_highbd_fht32x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_highbd_fht32x16/;
add_proto qw/void av1_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_highbd_fht8x8/;
@@ -621,6 +543,14 @@
add_proto qw/void av1_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/av1_highbd_fht16x16/;
+ add_proto qw/void av1_highbd_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_highbd_fht32x32/;
+
+ if (aom_config("CONFIG_TX64X64") eq "yes") {
+ add_proto qw/void av1_highbd_fht64x64/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/av1_highbd_fht64x64/;
+ }
+
add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/av1_highbd_fwht4x4/;
@@ -630,6 +560,15 @@
}
# End av1_high encoder functions
+if (aom_config("CONFIG_EXT_INTER") eq "yes") {
+ add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
+ specialize qw/av1_wedge_sse_from_residuals sse2/;
+ add_proto qw/int av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
+ specialize qw/av1_wedge_sign_from_residuals sse2/;
+ add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
+ specialize qw/av1_wedge_compute_delta_squares sse2/;
+}
+
}
# end encoder functions
diff --git a/av1/common/av1_txfm.h b/av1/common/av1_txfm.h
new file mode 100644
index 0000000..3b78981
--- /dev/null
+++ b/av1/common/av1_txfm.h
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef AV1_TXFM_H_
+#define AV1_TXFM_H_
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "av1/common/enums.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static const int cos_bit_min = 10;
+static const int cos_bit_max = 16;
+
+// cospi_arr[i][j] = (int)round(cos(M_PI*j/128) * (1<<(cos_bit_min+i)));
+static const int32_t cospi_arr[7][64] = {
+ { 1024, 1024, 1023, 1021, 1019, 1016, 1013, 1009, 1004, 999, 993, 987, 980,
+ 972, 964, 955, 946, 936, 926, 915, 903, 891, 878, 865, 851, 837,
+ 822, 807, 792, 775, 759, 742, 724, 706, 688, 669, 650, 630, 610,
+ 590, 569, 548, 526, 505, 483, 460, 438, 415, 392, 369, 345, 321,
+ 297, 273, 249, 224, 200, 175, 150, 125, 100, 75, 50, 25 },
+ { 2048, 2047, 2046, 2042, 2038, 2033, 2026, 2018, 2009, 1998, 1987,
+ 1974, 1960, 1945, 1928, 1911, 1892, 1872, 1851, 1829, 1806, 1782,
+ 1757, 1730, 1703, 1674, 1645, 1615, 1583, 1551, 1517, 1483, 1448,
+ 1412, 1375, 1338, 1299, 1260, 1220, 1179, 1138, 1096, 1053, 1009,
+ 965, 921, 876, 830, 784, 737, 690, 642, 595, 546, 498,
+ 449, 400, 350, 301, 251, 201, 151, 100, 50 },
+ { 4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973,
+ 3948, 3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564,
+ 3513, 3461, 3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896,
+ 2824, 2751, 2675, 2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019,
+ 1931, 1842, 1751, 1660, 1567, 1474, 1380, 1285, 1189, 1092, 995,
+ 897, 799, 700, 601, 501, 401, 301, 201, 101 },
+ { 8192, 8190, 8182, 8170, 8153, 8130, 8103, 8071, 8035, 7993, 7946,
+ 7895, 7839, 7779, 7713, 7643, 7568, 7489, 7405, 7317, 7225, 7128,
+ 7027, 6921, 6811, 6698, 6580, 6458, 6333, 6203, 6070, 5933, 5793,
+ 5649, 5501, 5351, 5197, 5040, 4880, 4717, 4551, 4383, 4212, 4038,
+ 3862, 3683, 3503, 3320, 3135, 2948, 2760, 2570, 2378, 2185, 1990,
+ 1795, 1598, 1401, 1202, 1003, 803, 603, 402, 201 },
+ { 16384, 16379, 16364, 16340, 16305, 16261, 16207, 16143, 16069, 15986, 15893,
+ 15791, 15679, 15557, 15426, 15286, 15137, 14978, 14811, 14635, 14449, 14256,
+ 14053, 13842, 13623, 13395, 13160, 12916, 12665, 12406, 12140, 11866, 11585,
+ 11297, 11003, 10702, 10394, 10080, 9760, 9434, 9102, 8765, 8423, 8076,
+ 7723, 7366, 7005, 6639, 6270, 5897, 5520, 5139, 4756, 4370, 3981,
+ 3590, 3196, 2801, 2404, 2006, 1606, 1205, 804, 402 },
+ { 32768, 32758, 32729, 32679, 32610, 32522, 32413, 32286, 32138, 31972, 31786,
+ 31581, 31357, 31114, 30853, 30572, 30274, 29957, 29622, 29269, 28899, 28511,
+ 28106, 27684, 27246, 26791, 26320, 25833, 25330, 24812, 24279, 23732, 23170,
+ 22595, 22006, 21403, 20788, 20160, 19520, 18868, 18205, 17531, 16846, 16151,
+ 15447, 14733, 14010, 13279, 12540, 11793, 11039, 10279, 9512, 8740, 7962,
+ 7180, 6393, 5602, 4808, 4011, 3212, 2411, 1608, 804 },
+ { 65536, 65516, 65457, 65358, 65220, 65043, 64827, 64571, 64277, 63944, 63572,
+ 63162, 62714, 62228, 61705, 61145, 60547, 59914, 59244, 58538, 57798, 57022,
+ 56212, 55368, 54491, 53581, 52639, 51665, 50660, 49624, 48559, 47464, 46341,
+ 45190, 44011, 42806, 41576, 40320, 39040, 37736, 36410, 35062, 33692, 32303,
+ 30893, 29466, 28020, 26558, 25080, 23586, 22078, 20557, 19024, 17479, 15924,
+ 14359, 12785, 11204, 9616, 8022, 6424, 4821, 3216, 1608 }
+};
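+// For example, cospi_arr[0][16] = (int)round(cos(M_PI * 16 / 128) * (1 << 10))
+//                                = (int)round(0.92388 * 1024) = 946.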
+
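+// Rounds to nearest before shifting, e.g. round_shift(3, 1) = (3 + 1) >> 1 = 2,
+// whereas a plain arithmetic shift would give 3 >> 1 = 1.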
+static INLINE int32_t round_shift(int32_t value, int bit) {
+ return (value + (1 << (bit - 1))) >> bit;
+}
+
+static INLINE void round_shift_array(int32_t *arr, int size, int bit) {
+ int i;
+ if (bit == 0) {
+ return;
+ } else {
+ if (bit > 0) {
+ for (i = 0; i < size; i++) {
+ arr[i] = round_shift(arr[i], bit);
+ }
+ } else {
+ for (i = 0; i < size; i++) {
+ arr[i] = arr[i] << (-bit);
+ }
+ }
+ }
+}
+
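+// Weighted butterfly: returns round_shift(w0 * in0 + w1 * in1, bit). The
+// weights are typically cos/sin constants taken from cospi_arr; the optional
+// range check reports any overflow of the 32-bit accumulator.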
+static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
+ int bit) {
+ int32_t result_32 = w0 * in0 + w1 * in1;
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ int64_t result_64 = (int64_t)w0 * (int64_t)in0 + (int64_t)w1 * (int64_t)in1;
+ if (result_32 != result_64) {
+ printf("%s %d overflow result_32: %d result_64: %" PRId64
+ " w0: %d in0: %d w1: %d in1: "
+ "%d\n",
+ __FILE__, __LINE__, result_32, result_64, w0, in0, w1, in1);
+ assert(0 && "half_btf overflow");
+ }
+#endif
+ return round_shift(result_32, bit);
+}
+
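+// Index of the highest set bit, e.g. get_max_bit(8) = 3 and get_max_bit(5) = 2;
+// returns -1 for x == 0.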
+static INLINE int get_max_bit(int x) {
+ int max_bit = -1;
+ while (x) {
+ x = x >> 1;
+ max_bit++;
+ }
+ return max_bit;
+}
+
+// TODO(angiebird): add an SSE (SIMD) implementation
+static INLINE void clamp_block(int16_t *block, int block_size, int stride,
+ int low, int high) {
+ int i, j;
+ for (i = 0; i < block_size; ++i) {
+ for (j = 0; j < block_size; ++j) {
+ block[i * stride + j] = clamp(block[i * stride + j], low, high);
+ }
+ }
+}
+
+typedef void (*TxfmFunc)(const int32_t *input, int32_t *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+typedef enum TXFM_TYPE {
+ TXFM_TYPE_DCT4,
+ TXFM_TYPE_DCT8,
+ TXFM_TYPE_DCT16,
+ TXFM_TYPE_DCT32,
+ TXFM_TYPE_DCT64,
+ TXFM_TYPE_ADST4,
+ TXFM_TYPE_ADST8,
+ TXFM_TYPE_ADST16,
+ TXFM_TYPE_ADST32,
+} TXFM_TYPE;
+
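+// Per-transform-size configuration: shift schedule, per-stage value ranges,
+// and cos-bit precision for the column and row 1D transforms.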
+typedef struct TXFM_2D_CFG {
+ const int txfm_size;
+ const int stage_num_col;
+ const int stage_num_row;
+
+ const int8_t *shift;
+ const int8_t *stage_range_col;
+ const int8_t *stage_range_row;
+ const int8_t *cos_bit_col;
+ const int8_t *cos_bit_row;
+ const TXFM_TYPE txfm_type_col;
+ const TXFM_TYPE txfm_type_row;
+} TXFM_2D_CFG;
+
+typedef struct TXFM_2D_FLIP_CFG {
+ int ud_flip; // flip upside down
+ int lr_flip; // flip left to right
+ const TXFM_2D_CFG *cfg;
+} TXFM_2D_FLIP_CFG;
+
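+// Derives the flip flags from the transform type: a FLIPADST in the first
+// position of the name sets ud_flip, and a FLIPADST in the second position
+// sets lr_flip.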
+static INLINE void set_flip_cfg(int tx_type, TXFM_2D_FLIP_CFG *cfg) {
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ cfg->ud_flip = 0;
+ cfg->lr_flip = 0;
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ cfg->ud_flip = 1;
+ cfg->lr_flip = 0;
+ break;
+ case DCT_FLIPADST:
+ cfg->ud_flip = 0;
+ cfg->lr_flip = 1;
+ break;
+ case FLIPADST_FLIPADST:
+ cfg->ud_flip = 1;
+ cfg->lr_flip = 1;
+ break;
+ case ADST_FLIPADST:
+ cfg->ud_flip = 0;
+ cfg->lr_flip = 1;
+ break;
+ case FLIPADST_ADST:
+ cfg->ud_flip = 1;
+ cfg->lr_flip = 0;
+ break;
+#endif // CONFIG_EXT_TX
+ default:
+ cfg->ud_flip = 0;
+ cfg->lr_flip = 0;
+ assert(0);
+ }
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+TXFM_2D_FLIP_CFG av1_get_fwd_txfm_cfg(int tx_type, int tx_size);
+TXFM_2D_FLIP_CFG av1_get_fwd_txfm_64x64_cfg(int tx_type);
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // AV1_TXFM_H_
diff --git a/av1/common/blockd.c b/av1/common/blockd.c
index 932bdf2..8cfd223 100644
--- a/av1/common/blockd.c
+++ b/av1/common/blockd.c
@@ -9,6 +9,10 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <math.h>
+
+#include "aom_ports/system_state.h"
+
#include "av1/common/blockd.h"
PREDICTION_MODE av1_left_block_mode(const MODE_INFO *cur_mi,
@@ -45,30 +49,35 @@
// transform size varies per plane, look it up in a common way.
const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
- const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
- const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
- const int step = 1 << (tx_size_1d_in_unit_log2[tx_size] * 2);
+ const int num_4x4_w = block_size_wide[plane_bsize];
+ const int num_4x4_h = block_size_high[plane_bsize];
+ const uint8_t txw_unit = tx_size_wide_unit[tx_size];
+ const uint8_t txh_unit = tx_size_high_unit[tx_size];
+ const int step = txw_unit * txh_unit;
int i = 0, r, c;
// If mb_to_right_edge is < 0 we are in a situation in which
// the current block size extends into the UMV and we won't
// visit the sub blocks that are wholly within the UMV.
- const int max_blocks_wide =
+ int max_blocks_wide =
num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >>
- (5 + pd->subsampling_x));
- const int max_blocks_high =
+ (3 + pd->subsampling_x));
+ int max_blocks_high =
num_4x4_h + (xd->mb_to_bottom_edge >= 0
? 0
- : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+ : xd->mb_to_bottom_edge >> (3 + pd->subsampling_y));
const int extra_step =
- ((num_4x4_w - max_blocks_wide) >> tx_size_1d_in_unit_log2[tx_size]) *
- step;
+ ((num_4x4_w - max_blocks_wide) >> tx_size_wide_log2[tx_size]) * step;
+
+ // Scale to the transform block unit.
+ max_blocks_wide >>= tx_size_wide_log2[0];
+ max_blocks_high >>= tx_size_high_log2[0];
// Keep track of the row and column of the blocks we use so that we know
// if we are in the unrestricted motion border.
- for (r = 0; r < max_blocks_high; r += tx_size_1d_in_unit[tx_size]) {
+ for (r = 0; r < max_blocks_high; r += txh_unit) {
// Skip visiting the sub blocks that are wholly within the UMV.
- for (c = 0; c < max_blocks_wide; c += tx_size_1d_in_unit[tx_size]) {
+ for (c = 0; c < max_blocks_wide; c += txw_unit) {
visit(plane, i, r, c, plane_bsize, tx_size, arg);
i += step;
}
@@ -81,7 +90,6 @@
foreach_transformed_block_visitor visit,
void *arg) {
int plane;
-
for (plane = 0; plane < MAX_MB_PLANE; ++plane)
av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
}
@@ -91,21 +99,22 @@
TX_SIZE tx_size, int has_eob, int aoff, int loff) {
ENTROPY_CONTEXT *const a = pd->above_context + aoff;
ENTROPY_CONTEXT *const l = pd->left_context + loff;
- const int tx_size_in_blocks = tx_size_1d_in_unit[tx_size];
+ const int txs_wide = tx_size_wide_unit[tx_size];
+ const int txs_high = tx_size_high_unit[tx_size];
// above
if (has_eob && xd->mb_to_right_edge < 0) {
int i;
const int blocks_wide =
pd->n4_w + (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
- int above_contexts = tx_size_in_blocks;
+ int above_contexts = txs_wide;
if (above_contexts + aoff > blocks_wide)
above_contexts = blocks_wide - aoff;
for (i = 0; i < above_contexts; ++i) a[i] = has_eob;
- for (i = above_contexts; i < tx_size_in_blocks; ++i) a[i] = 0;
+ for (i = above_contexts; i < txs_wide; ++i) a[i] = 0;
} else {
- memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+ memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * txs_wide);
}
// left
@@ -113,13 +122,13 @@
int i;
const int blocks_high =
pd->n4_h + (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
- int left_contexts = tx_size_in_blocks;
+ int left_contexts = txs_high;
if (left_contexts + loff > blocks_high) left_contexts = blocks_high - loff;
for (i = 0; i < left_contexts; ++i) l[i] = has_eob;
- for (i = left_contexts; i < tx_size_in_blocks; ++i) l[i] = 0;
+ for (i = left_contexts; i < txs_high; ++i) l[i] = 0;
} else {
- memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+ memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * txs_high);
}
}
#endif
@@ -133,3 +142,27 @@
xd->plane[i].subsampling_y = i ? ss_y : 0;
}
}
+
+#if CONFIG_EXT_INTRA
+const int16_t dr_intra_derivative[90] = {
+ 1, 14666, 7330, 4884, 3660, 2926, 2435, 2084, 1821, 1616, 1451, 1317, 1204,
+ 1108, 1026, 955, 892, 837, 787, 743, 703, 666, 633, 603, 574, 548,
+ 524, 502, 481, 461, 443, 426, 409, 394, 379, 365, 352, 339, 327,
+ 316, 305, 294, 284, 274, 265, 256, 247, 238, 230, 222, 214, 207,
+ 200, 192, 185, 179, 172, 166, 159, 153, 147, 141, 136, 130, 124,
+ 119, 113, 108, 103, 98, 93, 88, 83, 78, 73, 68, 63, 59,
+ 54, 49, 45, 40, 35, 31, 26, 22, 17, 13, 8, 4,
+};
+
+int av1_is_intra_filter_switchable(int angle) {
+ assert(angle > 0 && angle < 270);
+ if (angle % 45 == 0) return 0;
+ if (angle > 90 && angle < 180) {
+ return 1;
+ } else {
+ return ((angle < 90 ? dr_intra_derivative[angle]
+ : dr_intra_derivative[270 - angle]) &
+ 0xFF) > 0;
+ }
+}
+#endif // CONFIG_EXT_INTRA
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 7c69ade..31836fb 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -19,12 +19,10 @@
#include "aom_scale/yv12config.h"
#include "av1/common/common_data.h"
+#include "av1/common/quant_common.h"
#include "av1/common/entropy.h"
#include "av1/common/entropymode.h"
#include "av1/common/mv.h"
-#if CONFIG_AOM_QM
-#include "av1/common/quant_common.h"
-#endif
#include "av1/common/scale.h"
#include "av1/common/seg_common.h"
#include "av1/common/tile_common.h"
@@ -40,56 +38,24 @@
#define MAX_MB_PLANE 3
-#if CONFIG_EXT_INTRA
-#define MAX_ANGLE_DELTA 3
-#define MAX_ANGLE_DELTA_UV 2
-#define ANGLE_STEP_UV 4
-
-static const uint8_t av1_angle_step_y[TX_SIZES][INTRA_MODES] = {
- {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- },
- {
- 0, 4, 4, 4, 4, 4, 4, 4, 4, 0,
- },
- {
- 0, 3, 3, 3, 3, 3, 3, 3, 3, 0,
- },
- {
- 0, 3, 3, 3, 3, 3, 3, 3, 3, 0,
- },
-};
-static const uint8_t av1_max_angle_delta_y[TX_SIZES][INTRA_MODES] = {
- {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- },
- {
- 0, 2, 2, 2, 2, 2, 2, 2, 2, 0,
- },
- {
- 0, 3, 3, 3, 3, 3, 3, 3, 3, 0,
- },
- {
- 0, 3, 3, 3, 3, 3, 3, 3, 3, 0,
- },
-};
-static const uint8_t mode_to_angle_map[INTRA_MODES] = {
- 0, 90, 180, 45, 135, 111, 157, 203, 67, 0,
-};
-
-static INLINE int is_directional_mode(PREDICTION_MODE mode) {
- return (mode < TM_PRED && mode != DC_PRED);
-}
-#endif // CONFIG_EXT_INTRA
-
typedef enum {
KEY_FRAME = 0,
INTER_FRAME = 1,
FRAME_TYPES,
} FRAME_TYPE;
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+#define IsInterpolatingFilter(filter) (av1_is_interpolating_filter(filter))
+#else
+#define IsInterpolatingFilter(filter) (1)
+#endif // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+
static INLINE int is_inter_mode(PREDICTION_MODE mode) {
+#if CONFIG_EXT_INTER
+ return mode >= NEARESTMV && mode <= NEW_NEWMV;
+#else
return mode >= NEARESTMV && mode <= NEWMV;
+#endif // CONFIG_EXT_INTER
}
#if CONFIG_PVQ
@@ -119,6 +85,91 @@
} PVQ_QUEUE;
#endif
+#if CONFIG_EXT_INTER
+static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) {
+ return mode >= NEARESTMV && mode <= NEWFROMNEARMV;
+}
+
+static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) {
+ return mode >= NEAREST_NEARESTMV && mode <= NEW_NEWMV;
+}
+
+static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) {
+ static PREDICTION_MODE lut[MB_MODE_COUNT] = {
+ MB_MODE_COUNT, // DC_PRED 0
+ MB_MODE_COUNT, // V_PRED 1
+ MB_MODE_COUNT, // H_PRED 2
+ MB_MODE_COUNT, // D45_PRED 3
+ MB_MODE_COUNT, // D135_PRED 4
+ MB_MODE_COUNT, // D117_PRED 5
+ MB_MODE_COUNT, // D153_PRED 6
+ MB_MODE_COUNT, // D207_PRED 7
+ MB_MODE_COUNT, // D63_PRED 8
+ MB_MODE_COUNT, // TM_PRED 9
+ MB_MODE_COUNT, // NEARESTMV 10
+ MB_MODE_COUNT, // NEARMV 11
+ MB_MODE_COUNT, // ZEROMV 12
+ MB_MODE_COUNT, // NEWMV 13
+ MB_MODE_COUNT, // NEWFROMNEARMV 14
+ NEARESTMV, // NEAREST_NEARESTMV 15
+ NEARESTMV, // NEAREST_NEARMV 16
+ NEARMV, // NEAR_NEARESTMV 17
+ NEARMV, // NEAR_NEARMV 18
+ NEARESTMV, // NEAREST_NEWMV 19
+ NEWMV, // NEW_NEARESTMV 20
+ NEARMV, // NEAR_NEWMV 21
+ NEWMV, // NEW_NEARMV 22
+ ZEROMV, // ZERO_ZEROMV 23
+ NEWMV, // NEW_NEWMV 24
+ };
+ assert(is_inter_compound_mode(mode));
+ return lut[mode];
+}
+
+static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) {
+ static PREDICTION_MODE lut[MB_MODE_COUNT] = {
+ MB_MODE_COUNT, // DC_PRED 0
+ MB_MODE_COUNT, // V_PRED 1
+ MB_MODE_COUNT, // H_PRED 2
+ MB_MODE_COUNT, // D45_PRED 3
+ MB_MODE_COUNT, // D135_PRED 4
+ MB_MODE_COUNT, // D117_PRED 5
+ MB_MODE_COUNT, // D153_PRED 6
+ MB_MODE_COUNT, // D207_PRED 7
+ MB_MODE_COUNT, // D63_PRED 8
+ MB_MODE_COUNT, // TM_PRED 9
+ MB_MODE_COUNT, // NEARESTMV 10
+ MB_MODE_COUNT, // NEARMV 11
+ MB_MODE_COUNT, // ZEROMV 12
+ MB_MODE_COUNT, // NEWMV 13
+ MB_MODE_COUNT, // NEWFROMNEARMV 14
+ NEARESTMV, // NEAREST_NEARESTMV 15
+ NEARMV, // NEAREST_NEARMV 16
+ NEARESTMV, // NEAR_NEARESTMV 17
+ NEARMV, // NEAR_NEARMV 18
+ NEWMV, // NEAREST_NEWMV 19
+ NEARESTMV, // NEW_NEARESTMV 20
+ NEWMV, // NEAR_NEWMV 21
+ NEARMV, // NEW_NEARMV 22
+ ZEROMV, // ZERO_ZEROMV 23
+ NEWMV, // NEW_NEWMV 24
+ };
+ assert(is_inter_compound_mode(mode));
+ return lut[mode];
+}
+
+static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
+ return (mode == NEWMV || mode == NEWFROMNEARMV || mode == NEW_NEWMV ||
+ mode == NEAREST_NEWMV || mode == NEW_NEARESTMV ||
+ mode == NEAR_NEWMV || mode == NEW_NEARMV);
+}
+#else
+
+static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
+ return (mode == NEWMV);
+}
+#endif // CONFIG_EXT_INTER
+
/* For keyframes, intra block modes are predicted by the (already decoded)
modes for the Y blocks to the left and above us; for interframes, there
is a single probability table. */
@@ -129,6 +180,9 @@
#if CONFIG_REF_MV
int_mv pred_mv[2];
#endif
+#if CONFIG_EXT_INTER
+ int_mv ref_mv[2];
+#endif // CONFIG_EXT_INTER
} b_mode_info;
typedef int8_t MV_REFERENCE_FRAME;
@@ -149,11 +203,30 @@
} PALETTE_MODE_INFO;
#endif // CONFIG_PALETTE
-#if CONFIG_REF_MV
-#define MODE_CTX_REF_FRAMES (MAX_REF_FRAMES + COMP_REFS)
-#else
-#define MODE_CTX_REF_FRAMES MAX_REF_FRAMES
+#if CONFIG_FILTER_INTRA
+typedef struct {
+ // 1: a filter intra mode is used; 0: otherwise.
+ uint8_t use_filter_intra_mode[PLANE_TYPES];
+ FILTER_INTRA_MODE filter_intra_mode[PLANE_TYPES];
+} FILTER_INTRA_MODE_INFO;
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_VAR_TX
+#define TXB_COEFF_COST_MAP_SIZE (2 * MAX_MIB_SIZE)
+
+// TODO(angiebird): Merge RD_COST and RD_STATS
+typedef struct RD_STATS {
+ int rate;
+ int64_t dist;
+ int64_t sse;
+ int skip;
+#if CONFIG_RD_DEBUG
+ int txb_coeff_cost[MAX_MB_PLANE];
+ int txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE]
+ [TXB_COEFF_COST_MAP_SIZE];
#endif
+} RD_STATS;
+#endif // CONFIG_VAR_TX
// This structure now relates to 8x8 block regions.
typedef struct {
@@ -161,9 +234,18 @@
BLOCK_SIZE sb_type;
PREDICTION_MODE mode;
TX_SIZE tx_size;
+#if CONFIG_VAR_TX
+ // TODO(jingning): This effectively assigns a separate entry for each
+ // 8x8 block, which takes much more space than needed.
+ TX_SIZE inter_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
+ TX_SIZE min_tx_size;
+#endif
int8_t skip;
- int8_t has_no_coeffs;
int8_t segment_id;
+#if CONFIG_SUPERTX
+ // Minimum of all segment IDs under the current supertx block.
+ int8_t segment_id_supertx;
+#endif // CONFIG_SUPERTX
int8_t seg_id_predicted; // valid only when temporal_update is enabled
// Only for INTRA blocks
@@ -172,30 +254,57 @@
PALETTE_MODE_INFO palette_mode_info;
#endif // CONFIG_PALETTE
-#if CONFIG_EXT_INTRA
- // The actual prediction angle is the base angle + (angle_delta * step).
- int8_t intra_angle_delta[2];
-#endif // CONFIG_EXT_INTRA
-
- // Only for INTER blocks
+// Only for INTER blocks
+#if CONFIG_DUAL_FILTER
+ InterpFilter interp_filter[4];
+#else
InterpFilter interp_filter;
+#endif
MV_REFERENCE_FRAME ref_frame[2];
-#if CONFIG_MOTION_VAR
- MOTION_MODE motion_mode;
-#endif // CONFIG_MOTION_VAR
TX_TYPE tx_type;
-#if CONFIG_REF_MV
+#if CONFIG_FILTER_INTRA
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_EXT_INTRA
+ int8_t angle_delta[2];
+ // TODO(huisu): this may be replaced by interp_filter
+ INTRA_FILTER intra_filter;
+#endif // CONFIG_EXT_INTRA
+
+#if CONFIG_EXT_INTER
+ INTERINTRA_MODE interintra_mode;
+ // TODO(debargha): Consolidate these flags
+ int use_wedge_interintra;
+ int interintra_wedge_index;
+ int interintra_wedge_sign;
+ int use_wedge_interinter;
+ int interinter_wedge_index;
+ int interinter_wedge_sign;
+#endif // CONFIG_EXT_INTER
+ MOTION_MODE motion_mode;
+ int_mv mv[2];
int_mv pred_mv[2];
+#if CONFIG_REF_MV
uint8_t ref_mv_idx;
#endif
- // TODO(slavarnway): Delete and use bmi[3].as_mv[] instead.
- int_mv mv[2];
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition;
+#endif
+#if CONFIG_NEW_QUANT
+ int dq_off_index;
+ int send_dq_bit;
+#endif // CONFIG_NEW_QUANT
/* deringing gain *per-superblock* */
int8_t dering_gain;
#if CONFIG_DELTA_Q
int current_q_index;
#endif
+#if CONFIG_RD_DEBUG
+ RD_STATS rd_stats;
+ int mi_row;
+ int mi_col;
+#endif
} MB_MODE_INFO;
typedef struct MODE_INFO {
@@ -225,10 +334,13 @@
struct buf_2d {
uint8_t *buf;
+ uint8_t *buf0;
+ int width;
+ int height;
int stride;
};
-struct macroblockd_plane {
+typedef struct macroblockd_plane {
tran_low_t *dqcoeff;
PLANE_TYPE plane_type;
int subsampling_x;
@@ -238,6 +350,10 @@
ENTROPY_CONTEXT *above_context;
ENTROPY_CONTEXT *left_context;
int16_t seg_dequant[MAX_SEGMENTS][2];
+#if CONFIG_NEW_QUANT
+ dequant_val_type_nuq seg_dequant_nuq[MAX_SEGMENTS][QUANT_PROFILES]
+ [COEF_BANDS];
+#endif
#if CONFIG_PALETTE
uint8_t *color_index_map;
#endif // CONFIG_PALETTE
@@ -246,12 +362,17 @@
uint16_t n4_w, n4_h;
// log2 of n4_w, n4_h
uint8_t n4_wl, n4_hl;
+ // block size in pixels
+ uint8_t width, height;
#if CONFIG_AOM_QM
const qm_val_t *seg_iqmatrix[MAX_SEGMENTS][2][TX_SIZES];
#endif
// encoder
const int16_t *dequant;
+#if CONFIG_NEW_QUANT
+ const dequant_val_type_nuq *dequant_val_nuq[QUANT_PROFILES];
+#endif // CONFIG_NEW_QUANT
#if CONFIG_AOM_QM
const qm_val_t *seg_qmatrix[MAX_SEGMENTS][2][TX_SIZES];
#endif
@@ -261,7 +382,7 @@
// PVQ: forward transformed predicted image, a reference for PVQ.
tran_low_t *pvq_ref_coeff;
#endif
-};
+} MACROBLOCKD_PLANE;
#define BLOCK_OFFSET(x, i) ((x) + (i)*16)
@@ -292,14 +413,14 @@
int up_available;
int left_available;
+ const aom_prob (*partition_probs)[PARTITION_TYPES - 1];
+
/* Distance of MB away from frame edges */
int mb_to_left_edge;
int mb_to_right_edge;
int mb_to_top_edge;
int mb_to_bottom_edge;
- uint8_t n8_w, n8_h;
-
FRAME_CONTEXT *fc;
/* pointers to reference frames */
@@ -308,18 +429,32 @@
/* pointer to current frame */
const YV12_BUFFER_CONFIG *cur_buf;
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+ ENTROPY_CONTEXT left_context[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
+
+ PARTITION_CONTEXT *above_seg_context;
+ PARTITION_CONTEXT left_seg_context[MAX_MIB_SIZE];
+
+#if CONFIG_VAR_TX
+ TXFM_CONTEXT *above_txfm_context;
+ TXFM_CONTEXT *left_txfm_context;
+ TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE];
+
+ TX_SIZE max_tx_size;
+#if CONFIG_SUPERTX
+ TX_SIZE supertx_size;
+#endif
+#endif
+
+ // Dimensions of the current block in units of 8x8 blocks.
+ uint8_t n8_w, n8_h;
+
#if CONFIG_REF_MV
uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
uint8_t is_sec_rect;
#endif
- ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
- ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
-
- PARTITION_CONTEXT *above_seg_context;
- PARTITION_CONTEXT left_seg_context[8];
-
#if CONFIG_PVQ
daala_dec_ctx daala_dec;
#endif
@@ -328,10 +463,14 @@
int bd;
#endif
+ int qindex[MAX_SEGMENTS];
int lossless[MAX_SEGMENTS];
int corrupted;
struct aom_internal_error_info *error_info;
+#if CONFIG_GLOBAL_MOTION
+ Global_Motion_Params *global_motion;
+#endif // CONFIG_GLOBAL_MOTION
#if CONFIG_DELTA_Q
int prev_qindex;
int delta_qindex;
@@ -341,7 +480,10 @@
static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
PARTITION_TYPE partition) {
- return subsize_lookup[partition][bsize];
+ if (partition == PARTITION_INVALID)
+ return BLOCK_INVALID;
+ else
+ return subsize_lookup[partition][bsize];
}
static const TX_TYPE intra_mode_to_tx_type_context[INTRA_MODES] = {
@@ -357,35 +499,291 @@
ADST_ADST, // TM
};
+#if CONFIG_SUPERTX
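+// A supertx block is one whose (square-mapped) transform size is larger than
+// the block itself, i.e. a single transform covers more than one prediction
+// block.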
+static INLINE int supertx_enabled(const MB_MODE_INFO *mbmi) {
+ return (int)txsize_sqr_map[mbmi->tx_size] >
+ AOMMIN(b_width_log2_lookup[mbmi->sb_type],
+ b_height_log2_lookup[mbmi->sb_type]);
+}
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_EXT_TX
+#define ALLOW_INTRA_EXT_TX 1
+
+static const int num_ext_tx_set_inter[EXT_TX_SETS_INTER] = { 1, 16, 12, 2 };
+static const int num_ext_tx_set_intra[EXT_TX_SETS_INTRA] = { 1, 7, 5 };
+
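+// Maps (tx_size, block size, inter/intra) to an ext-tx set index: 0 for
+// transforms larger than 32x32 or blocks smaller than 8x8, 3 for inter 32x32
+// (0 for intra 32x32), 2 for 16x16, and 1 otherwise.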
+static INLINE int get_ext_tx_set(TX_SIZE tx_size, BLOCK_SIZE bs, int is_inter) {
+ tx_size = txsize_sqr_map[tx_size];
+ if (tx_size > TX_32X32 || bs < BLOCK_8X8) return 0;
+ if (tx_size == TX_32X32) return is_inter ? 3 : 0;
+ return (tx_size == TX_16X16 ? 2 : 1);
+}
+
+static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA][TX_SIZES] = {
+ { 0, 0, 0, 0 }, // unused
+ { 1, 1, 0, 0 },
+ { 0, 0, 1, 0 },
+};
+
+static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER][TX_SIZES] = {
+ { 0, 0, 0, 0 }, // unused
+ { 1, 1, 0, 0 },
+ { 0, 0, 1, 0 },
+ { 0, 0, 0, 1 },
+};
+
+// Transform types used in each intra set
+static const int ext_tx_used_intra[EXT_TX_SETS_INTRA][TX_TYPES] = {
+ { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
+};
+
+// Transform types used in each inter set
+static const int ext_tx_used_inter[EXT_TX_SETS_INTER][TX_TYPES] = {
+ { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 },
+ { 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
+};
+
+// 1D transforms used in the inter set; this needs to be updated whenever
+// ext_tx_used_inter is changed.
+static const int ext_tx_used_inter_1D[EXT_TX_SETS_INTER][TX_TYPES_1D] = {
+ { 1, 0, 0, 0 }, { 1, 1, 1, 1 }, { 1, 1, 1, 1 }, { 1, 0, 0, 1 },
+};
+
+static INLINE int get_ext_tx_types(TX_SIZE tx_size, BLOCK_SIZE bs,
+ int is_inter) {
+ const int set = get_ext_tx_set(tx_size, bs, is_inter);
+ return is_inter ? num_ext_tx_set_inter[set] : num_ext_tx_set_intra[set];
+}
+
+#if CONFIG_RECT_TX
+static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
+ static const char LUT[BLOCK_SIZES] = {
+ 0, // BLOCK_4X4
+ 1, // BLOCK_4X8
+ 1, // BLOCK_8X4
+ 0, // BLOCK_8X8
+ 1, // BLOCK_8X16
+ 1, // BLOCK_16X8
+ 0, // BLOCK_16X16
+ 1, // BLOCK_16X32
+ 1, // BLOCK_32X16
+ 0, // BLOCK_32X32
+ 0, // BLOCK_32X64
+ 0, // BLOCK_64X32
+ 0, // BLOCK_64X64
+#if CONFIG_EXT_PARTITION
+ 0, // BLOCK_64X128
+ 0, // BLOCK_128X64
+ 0, // BLOCK_128X128
+#endif // CONFIG_EXT_PARTITION
+ };
+
+ return LUT[bsize];
+}
+
+static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi) {
+ return is_inter_block(mbmi) && is_rect_tx_allowed_bsize(mbmi->sb_type) &&
+ !xd->lossless[mbmi->segment_id];
+}
+
+static INLINE int is_rect_tx(TX_SIZE tx_size) { return tx_size >= TX_SIZES; }
+#endif // CONFIG_RECT_TX
+#endif // CONFIG_EXT_TX
+
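+// Picks the transform size implied by tx_mode: the largest size the mode
+// allows, capped at what fits in the block. With CONFIG_EXT_TX && CONFIG_RECT_TX,
+// inter blocks may instead use the block's maximal rectangular transform when
+// its square upscaling still fits under the tx_mode limit.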
+static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode,
+ int is_inter) {
+ const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (!is_inter) {
+ return AOMMIN(max_tx_size, largest_tx_size);
+ } else {
+ const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bsize];
+ if (txsize_sqr_up_map[max_rect_tx_size] <= largest_tx_size) {
+ return max_rect_tx_size;
+ } else {
+ return largest_tx_size;
+ }
+ }
+#else
+ (void)is_inter;
+ return AOMMIN(max_tx_size, largest_tx_size);
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+}
+
+#if CONFIG_FILTER_INTRA
+static const TX_TYPE filter_intra_mode_to_tx_type_lookup[FILTER_INTRA_MODES] = {
+ DCT_DCT, // FILTER_DC
+ ADST_DCT, // FILTER_V
+ DCT_ADST, // FILTER_H
+ DCT_DCT, // FILTER_D45
+ ADST_ADST, // FILTER_D135
+ ADST_DCT, // FILTER_D117
+ DCT_ADST, // FILTER_D153
+ DCT_ADST, // FILTER_D207
+ ADST_DCT, // FILTER_D63
+ ADST_ADST, // FILTER_TM
+};
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_EXT_INTRA
+#define ANGLE_STEP 3
+#define MAX_ANGLE_DELTAS 3
+extern const int16_t dr_intra_derivative[90];
+static const uint8_t mode_to_angle_map[INTRA_MODES] = {
+ 0, 90, 180, 45, 135, 111, 157, 203, 67, 0,
+};
+
+// Returns whether filter selection is needed for a given
+// intra prediction angle.
+int av1_is_intra_filter_switchable(int angle);
+#endif // CONFIG_EXT_INTRA
+
+#if CONFIG_EXT_TILE
+#define FIXED_TX_TYPE 1
+#else
+#define FIXED_TX_TYPE 0
+#endif
+
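+// Default transform type when none is signalled explicitly: DCT_DCT for inter
+// blocks, non-luma planes, lossless segments, and transforms of 32x32 or
+// larger; otherwise the type implied by the intra prediction mode.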
+static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type,
+ const MACROBLOCKD *xd, int block_idx,
+ TX_SIZE tx_size) {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+
+ if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y ||
+ xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32)
+ return DCT_DCT;
+
+ return intra_mode_to_tx_type_context[plane_type == PLANE_TYPE_Y
+ ? get_y_mode(xd->mi[0], block_idx)
+ : mbmi->uv_mode];
+}
+
static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type, const MACROBLOCKD *xd,
- int block_idx) {
+ int block_idx, TX_SIZE tx_size) {
const MODE_INFO *const mi = xd->mi[0];
const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ if (FIXED_TX_TYPE)
+ return get_default_tx_type(plane_type, xd, block_idx, tx_size);
+
+#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
+ if (!is_inter_block(mbmi)) {
+#if CONFIG_FILTER_INTRA
+ const int use_filter_intra_mode_info =
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[plane_type];
+ const FILTER_INTRA_MODE filter_intra_mode =
+ mbmi->filter_intra_mode_info.filter_intra_mode[plane_type];
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_EXT_INTRA
+ const PREDICTION_MODE mode = (plane_type == PLANE_TYPE_Y)
+ ? get_y_mode(mi, block_idx)
+ : mbmi->uv_mode;
+#endif // CONFIG_EXT_INTRA
+
+ if (xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32) return DCT_DCT;
+
+#if CONFIG_EXT_TX
+#if ALLOW_INTRA_EXT_TX
+ if (mbmi->sb_type >= BLOCK_8X8 && plane_type == PLANE_TYPE_Y)
+ return mbmi->tx_type;
+#endif // ALLOW_INTRA_EXT_TX
+#endif // CONFIG_EXT_TX
+
+#if CONFIG_FILTER_INTRA
+ if (use_filter_intra_mode_info)
+ return filter_intra_mode_to_tx_type_lookup[filter_intra_mode];
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_EXT_INTRA
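+    // For directional modes the transform type follows the (delta-adjusted)
+    // prediction angle: 135 degrees maps to ADST_ADST, angles outside
+    // (45, 225) fall back to DCT_DCT, mostly-vertical angles (45..135) map
+    // to ADST_DCT and mostly-horizontal angles (135..225) to DCT_ADST.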
+ if (mode == DC_PRED) {
+ return DCT_DCT;
+ } else if (mode == TM_PRED) {
+ return ADST_ADST;
+ } else {
+ int angle = mode_to_angle_map[mode];
+ if (mbmi->sb_type >= BLOCK_8X8)
+ angle += mbmi->angle_delta[plane_type] * ANGLE_STEP;
+ assert(angle > 0 && angle < 270);
+ if (angle == 135)
+ return ADST_ADST;
+ else if (angle < 45 || angle > 225)
+ return DCT_DCT;
+ else if (angle < 135)
+ return ADST_DCT;
+ else
+ return DCT_ADST;
+ }
+#endif // CONFIG_EXT_INTRA
+ }
+#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
+
+#if CONFIG_EXT_TX
+#if EXT_TX_SIZES == 4
+ if (xd->lossless[mbmi->segment_id] || txsize_sqr_map[tx_size] > TX_32X32 ||
+ (txsize_sqr_map[tx_size] >= TX_32X32 && !is_inter_block(mbmi)))
+#else
+ if (xd->lossless[mbmi->segment_id] || txsize_sqr_map[tx_size] >= TX_32X32)
+#endif
+ return DCT_DCT;
+ if (mbmi->sb_type >= BLOCK_8X8) {
+ if (plane_type == PLANE_TYPE_Y) {
+#if !ALLOW_INTRA_EXT_TX
+ if (is_inter_block(mbmi))
+#endif // ALLOW_INTRA_EXT_TX
+ return mbmi->tx_type;
+ }
+ if (is_inter_block(mbmi))
+ // UV Inter only
+ return (mbmi->tx_type == IDTX && txsize_sqr_map[tx_size] == TX_32X32)
+ ? DCT_DCT
+ : mbmi->tx_type;
+ }
+
+ // Sub8x8-Inter/Intra OR UV-Intra
+ if (is_inter_block(mbmi)) // Sub8x8-Inter
+ return DCT_DCT;
+ else // Sub8x8 Intra OR UV-Intra
+ return intra_mode_to_tx_type_context[plane_type == PLANE_TYPE_Y
+ ? get_y_mode(mi, block_idx)
+ : mbmi->uv_mode];
+#else // CONFIG_EXT_TX
(void)block_idx;
if (plane_type != PLANE_TYPE_Y || xd->lossless[mbmi->segment_id] ||
- mbmi->tx_size >= TX_32X32)
+ txsize_sqr_map[tx_size] >= TX_32X32)
return DCT_DCT;
-
return mbmi->tx_type;
+#endif // CONFIG_EXT_TX
}
void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y);
-static INLINE TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize,
- int xss, int yss) {
- if (bsize < BLOCK_8X8) {
- return TX_4X4;
- } else {
- const BLOCK_SIZE plane_bsize = ss_size_lookup[bsize][xss][yss];
- return AOMMIN(y_tx_size, max_txsize_lookup[plane_bsize]);
- }
+static INLINE int tx_size_to_depth(const TX_SIZE tx_size) {
+ return (int)(tx_size - TX_4X4);
+}
+
+static INLINE TX_SIZE depth_to_tx_size(const int depth) {
+ return (TX_SIZE)(depth + TX_4X4);
}
static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi,
const struct macroblockd_plane *pd) {
- return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type, pd->subsampling_x,
- pd->subsampling_y);
+ TX_SIZE uv_txsize;
+#if CONFIG_SUPERTX
+ if (supertx_enabled(mbmi))
+ return uvsupertx_size_lookup[txsize_sqr_map[mbmi->tx_size]]
+ [pd->subsampling_x][pd->subsampling_y];
+#endif // CONFIG_SUPERTX
+ uv_txsize = uv_txsize_lookup[mbmi->sb_type][mbmi->tx_size][pd->subsampling_x]
+ [pd->subsampling_y];
+ assert(uv_txsize != TX_INVALID);
+ return uv_txsize;
}
static INLINE BLOCK_SIZE
@@ -405,20 +803,6 @@
}
}
-#if CONFIG_MOTION_VAR
-static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) {
- return (bsize >= BLOCK_8X8);
-}
-
-static INLINE int is_motion_variation_allowed(const MB_MODE_INFO *mbmi) {
- return is_motion_variation_allowed_bsize(mbmi->sb_type);
-}
-
-static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
- return (is_inter_block(mbmi));
-}
-#endif // CONFIG_MOTION_VAR
-
typedef void (*foreach_transformed_block_visitor)(int plane, int block,
int blk_row, int blk_col,
BLOCK_SIZE plane_bsize,
@@ -436,6 +820,63 @@
void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
TX_SIZE tx_size, int has_eob, int aoff, int loff);
+#if CONFIG_EXT_INTER
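+// Inter-intra prediction is restricted to single-reference inter blocks
+// from 8x8 up to (but excluding) 64x64 that use one of the NEARESTMV..NEWMV
+// modes; such blocks carry INTRA_FRAME in their second reference slot.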
+static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
+ // TODO(debargha): Should this be bsize < BLOCK_LARGEST?
+ return (bsize >= BLOCK_8X8) && (bsize < BLOCK_64X64);
+}
+
+static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) {
+ return (mode >= NEARESTMV) && (mode <= NEWMV);
+}
+
+static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
+ return (rf[0] > INTRA_FRAME) && (rf[1] <= INTRA_FRAME);
+}
+
+static INLINE int is_interintra_allowed(const MB_MODE_INFO *mbmi) {
+ return is_interintra_allowed_bsize(mbmi->sb_type) &&
+ is_interintra_allowed_mode(mbmi->mode) &&
+ is_interintra_allowed_ref(mbmi->ref_frame);
+}
+
+static INLINE int is_interintra_allowed_bsize_group(const int group) {
+ int i;
+ for (i = 0; i < BLOCK_SIZES; i++) {
+ if (size_group_lookup[i] == group &&
+ is_interintra_allowed_bsize((BLOCK_SIZE)i)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static INLINE int is_interintra_pred(const MB_MODE_INFO *mbmi) {
+ return (mbmi->ref_frame[1] == INTRA_FRAME) && is_interintra_allowed(mbmi);
+}
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) {
+ return (bsize >= BLOCK_8X8);
+}
+
+static INLINE int is_motion_variation_allowed(const MB_MODE_INFO *mbmi) {
+#if CONFIG_EXT_INTER
+ return is_motion_variation_allowed_bsize(mbmi->sb_type) &&
+ mbmi->ref_frame[1] != INTRA_FRAME;
+#else
+ return is_motion_variation_allowed_bsize(mbmi->sb_type);
+#endif // CONFIG_EXT_INTER
+}
+
+#if CONFIG_MOTION_VAR
+static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
+ return (is_inter_block(mbmi));
+}
+#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/common/common.h b/av1/common/common.h
index 052bb17..551055a 100644
--- a/av1/common/common.h
+++ b/av1/common/common.h
@@ -16,7 +16,6 @@
#include <assert.h>
-#include "./aom_config.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_mem/aom_mem.h"
#include "aom/aom_integer.h"
@@ -36,14 +35,14 @@
}
// Use this for variably-sized arrays.
-#define av1_copy_array(dest, src, n) \
- { \
- assert(sizeof(*dest) == sizeof(*src)); \
- memcpy(dest, src, n * sizeof(*src)); \
+#define av1_copy_array(dest, src, n) \
+ { \
+ assert(sizeof(*(dest)) == sizeof(*(src))); \
+ memcpy(dest, src, n * sizeof(*(src))); \
}
#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
-#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest))
+#define av1_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
static INLINE int get_unsigned_bits(unsigned int num_values) {
return num_values > 0 ? get_msb(num_values) + 1 : 0;
diff --git a/av1/common/common_data.h b/av1/common/common_data.h
index d33bfe7..a9efb1b 100644
--- a/av1/common/common_data.h
+++ b/av1/common/common_data.h
@@ -20,140 +20,518 @@
extern "C" {
#endif
+#if CONFIG_EXT_PARTITION
+#define IF_EXT_PARTITION(...) __VA_ARGS__
+#else
+#define IF_EXT_PARTITION(...)
+#endif
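+// IF_EXT_PARTITION(...) expands to its arguments only when the extended
+// (up to 128x128) partitions are enabled; it appends the three extra block
+// sizes to the lookup tables below.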
+
// Log 2 conversion lookup tables for block width and height
-static const uint8_t b_width_log2_lookup[BLOCK_SIZES] = { 0, 0, 1, 1, 1, 2, 2,
- 2, 3, 3, 3, 4, 4 };
-static const uint8_t b_height_log2_lookup[BLOCK_SIZES] = { 0, 1, 0, 1, 2, 1, 2,
- 3, 2, 3, 4, 3, 4 };
-static const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES] = {
- 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16
+static const uint8_t b_width_log2_lookup[BLOCK_SIZES] = {
+ 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, IF_EXT_PARTITION(4, 5, 5)
};
-static const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES] = {
- 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16
+static const uint8_t b_height_log2_lookup[BLOCK_SIZES] = {
+ 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, IF_EXT_PARTITION(5, 4, 5)
};
// Log 2 conversion lookup tables for modeinfo width and height
-static const uint8_t mi_width_log2_lookup[BLOCK_SIZES] = { 0, 0, 0, 0, 0, 1, 1,
- 1, 2, 2, 2, 3, 3 };
-static const uint8_t mi_height_log2_lookup[BLOCK_SIZES] = { 0, 0, 0, 0, 1, 0, 1,
- 2, 1, 2, 3, 2, 3 };
+static const uint8_t mi_width_log2_lookup[BLOCK_SIZES] = {
+ 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, IF_EXT_PARTITION(3, 4, 4)
+};
+static const uint8_t mi_height_log2_lookup[BLOCK_SIZES] = {
+ 0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, IF_EXT_PARTITION(4, 3, 4)
+};
+
+// Width/height lookup tables, in pixels and in units of various block sizes
+static const uint8_t block_size_wide[BLOCK_SIZES] = {
+ 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32, 64, 64, IF_EXT_PARTITION(64, 128, 128)
+};
+static const uint8_t block_size_high[BLOCK_SIZES] = {
+ 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64, 32, 64, IF_EXT_PARTITION(128, 64, 128)
+};
+static const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES] = {
+ 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, IF_EXT_PARTITION(16, 32, 32)
+};
+static const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES] = {
+ 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, IF_EXT_PARTITION(32, 16, 32)
+};
static const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] = {
- 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8
+ 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, IF_EXT_PARTITION(8, 16, 16)
};
static const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] = {
- 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8
+ 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, IF_EXT_PARTITION(16, 8, 16)
+};
+static const uint8_t num_16x16_blocks_wide_lookup[BLOCK_SIZES] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, IF_EXT_PARTITION(4, 8, 8)
+};
+static const uint8_t num_16x16_blocks_high_lookup[BLOCK_SIZES] = {
+ 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, IF_EXT_PARTITION(8, 4, 8)
};
// AOMMIN(3, AOMMIN(b_width_log2(bsize), b_height_log2(bsize)))
-static const uint8_t size_group_lookup[BLOCK_SIZES] = { 0, 0, 0, 1, 1, 1, 2,
- 2, 2, 3, 3, 3, 3 };
+static const uint8_t size_group_lookup[BLOCK_SIZES] = {
+ 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, IF_EXT_PARTITION(3, 3, 3)
+};
static const uint8_t num_pels_log2_lookup[BLOCK_SIZES] = {
- 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12
+ 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, IF_EXT_PARTITION(13, 13, 14)
};
-static const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = {
- { // 4X4
- // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
- PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
- PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
- PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
- PARTITION_INVALID },
- { // 8X8
- // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
- PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
- PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
- PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
- PARTITION_INVALID },
- { // 16X16
- // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
- PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
- PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
- PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
- PARTITION_INVALID },
- { // 32X32
- // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
- PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
- PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT,
- PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID,
- PARTITION_INVALID },
- { // 64X64
- // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
- PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
- PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
- PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ,
- PARTITION_NONE }
-};
-
-static const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = {
- {
- // PARTITION_NONE
- BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
- BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64,
- BLOCK_64X32, BLOCK_64X64,
- },
- {
- // PARTITION_HORZ
- BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4, BLOCK_INVALID,
- BLOCK_INVALID, BLOCK_16X8, BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
- BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
- },
- {
- // PARTITION_VERT
- BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8, BLOCK_INVALID,
- BLOCK_INVALID, BLOCK_8X16, BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
- BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
- },
- {
- // PARTITION_SPLIT
- BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4, BLOCK_INVALID,
- BLOCK_INVALID, BLOCK_8X8, BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16,
- BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32,
+/* clang-format off */
+static const PARTITION_TYPE
+ partition_lookup[MAX_SB_SIZE_LOG2 - 1][BLOCK_SIZES] = {
+ { // 4X4 ->
+ // 4X4
+ PARTITION_NONE,
+ // 4X8, 8X4, 8X8
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ // 8X16, 16X8, 16X16
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ // 16X32, 32X16, 32X32
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ // 32X64, 64X32, 64X64
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#if CONFIG_EXT_PARTITION
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#endif // CONFIG_EXT_PARTITION
+ }, { // 8X8 ->
+ // 4X4
+ PARTITION_SPLIT,
+ // 4X8, 8X4, 8X8
+ PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
+ // 8X16, 16X8, 16X16
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ // 16X32, 32X16, 32X32
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ // 32X64, 64X32, 64X64
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#endif // CONFIG_EXT_PARTITION
+ }, { // 16X16 ->
+ // 4X4
+ PARTITION_SPLIT,
+ // 4X8, 8X4, 8X8
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 8X16, 16X8, 16X16
+ PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
+ // 16X32, 32X16, 32X32
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ // 32X64, 64X32, 64X64
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#endif // CONFIG_EXT_PARTITION
+ }, { // 32X32 ->
+ // 4X4
+ PARTITION_SPLIT,
+ // 4X8, 8X4, 8X8
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 8X16, 16X8, 16X16
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 16X32, 32X16, 32X32
+ PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
+ // 32X64, 64X32, 64X64
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#endif // CONFIG_EXT_PARTITION
+ }, { // 64X64 ->
+ // 4X4
+ PARTITION_SPLIT,
+ // 4X8, 8X4, 8X8
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 8X16, 16X8, 16X16
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 16X32, 32X16, 32X32
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 32X64, 64X32, 64X64
+ PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+ }, { // 128x128 ->
+ // 4X4
+ PARTITION_SPLIT,
+ // 4X8, 8X4, 8X8
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 8X16, 16X8, 16X16
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 16X32, 32X16, 32X32
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 32X64, 64X32, 64X64
+ PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
+ // 64x128, 128x64, 128x128
+ PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
+#endif // CONFIG_EXT_PARTITION
}
};
-// transform block size in pixels
-#if CONFIG_CB4X4
-static const int tx_size_1d[TX_SIZES] = { 2, 4, 8, 16, 32 };
+#if CONFIG_EXT_PARTITION_TYPES
+static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][BLOCK_SIZES] =
+#else
+static const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] =
+#endif // CONFIG_EXT_PARTITION_TYPES
+{
+ { // PARTITION_NONE
+ // 4X4
+ BLOCK_4X4,
+ // 4X8, 8X4, 8X8
+ BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
+ // 8X16, 16X8, 16X16
+ BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
+ // 16X32, 32X16, 32X32
+ BLOCK_16X32, BLOCK_32X16, BLOCK_32X32,
+ // 32X64, 64X32, 64X64
+ BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ BLOCK_64X128, BLOCK_128X64, BLOCK_128X128,
+#endif // CONFIG_EXT_PARTITION
+ }, { // PARTITION_HORZ
+ // 4X4
+ BLOCK_INVALID,
+ // 4X8, 8X4, 8X8
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+ // 8X16, 16X8, 16X16
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+ // 16X32, 32X16, 32X32
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+ // 32X64, 64X32, 64X64
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+#endif // CONFIG_EXT_PARTITION
+ }, { // PARTITION_VERT
+ // 4X4
+ BLOCK_INVALID,
+ // 4X8, 8X4, 8X8
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+ // 8X16, 16X8, 16X16
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+ // 16X32, 32X16, 32X32
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+ // 32X64, 64X32, 64X64
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+#endif // CONFIG_EXT_PARTITION
+ }, { // PARTITION_SPLIT
+ // 4X4
+ BLOCK_INVALID,
+ // 4X8, 8X4, 8X8
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
+ // 8X16, 16X8, 16X16
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X8,
+ // 16X32, 32X16, 32X32
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16,
+ // 32X64, 64X32, 64X64
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64,
+#endif // CONFIG_EXT_PARTITION
+#if CONFIG_EXT_PARTITION_TYPES
+ }, { // PARTITION_HORZ_A
+ // 4X4
+ BLOCK_INVALID,
+ // 4X8, 8X4, 8X8
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+ // 8X16, 16X8, 16X16
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+ // 16X32, 32X16, 32X32
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+ // 32X64, 64X32, 64X64
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+#endif // CONFIG_EXT_PARTITION
+ }, { // PARTITION_HORZ_B
+ // 4X4
+ BLOCK_INVALID,
+ // 4X8, 8X4, 8X8
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+ // 8X16, 16X8, 16X16
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+ // 16X32, 32X16, 32X32
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+ // 32X64, 64X32, 64X64
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+#endif // CONFIG_EXT_PARTITION
+ }, { // PARTITION_VERT_A
+ // 4X4
+ BLOCK_INVALID,
+ // 4X8, 8X4, 8X8
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+ // 8X16, 16X8, 16X16
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+ // 16X32, 32X16, 32X32
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+ // 32X64, 64X32, 64X64
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+#endif // CONFIG_EXT_PARTITION
+ }, { // PARTITION_VERT_B
+ // 4X4
+ BLOCK_INVALID,
+ // 4X8, 8X4, 8X8
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+ // 8X16, 16X8, 16X16
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+ // 16X32, 32X16, 32X32
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+ // 32X64, 64X32, 64X64
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_EXT_PARTITION_TYPES
+ }
+};
-static const int tx_size_2d[TX_SIZES] = { 4, 16, 64, 256, 1024 };
+static const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = {
+ // 4X4
+ TX_4X4,
+ // 4X8, 8X4, 8X8
+ TX_4X4, TX_4X4, TX_8X8,
+ // 8X16, 16X8, 16X16
+ TX_8X8, TX_8X8, TX_16X16,
+ // 16X32, 32X16, 32X32
+ TX_16X16, TX_16X16, TX_32X32,
+ // 32X64, 64X32, 64X64
+ TX_32X32, TX_32X32, TX_32X32,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ TX_32X32, TX_32X32, TX_32X32,
+#endif // CONFIG_EXT_PARTITION
+};
-static const uint8_t tx_size_1d_log2[TX_SIZES] = { 1, 2, 3, 4, 5 };
+static const TX_SIZE max_txsize_rect_lookup[BLOCK_SIZES] = {
+ // 4X4
+ TX_4X4,
+ // 4X8, 8X4, 8X8
+ TX_4X8, TX_8X4, TX_8X8,
+ // 8X16, 16X8, 16X16
+ TX_8X16, TX_16X8, TX_16X16,
+ // 16X32, 32X16, 32X32
+ TX_16X32, TX_32X16, TX_32X32,
+ // 32X64, 64X32, 64X64
+ TX_32X32, TX_32X32, TX_32X32,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ TX_32X32, TX_32X32, TX_32X32,
+#endif // CONFIG_EXT_PARTITION
+};
-static const int tx_size_1d_in_unit[TX_SIZES] = { 1, 1, 2, 4, 8 };
+// Same as "max_txsize_lookup[bsize] - TX_8X8", invalid for bsize < 8X8
+static const int32_t intra_tx_size_cat_lookup[BLOCK_SIZES] = {
+ // 4X4
+ INT32_MIN,
+ // 4X8, 8X4, 8X8
+ INT32_MIN, INT32_MIN, TX_8X8 - TX_8X8,
+ // 8X16, 16X8, 16X16
+ TX_8X8 - TX_8X8, TX_8X8 - TX_8X8, TX_16X16 - TX_8X8,
+ // 16X32, 32X16, 32X32
+ TX_16X16 - TX_8X8, TX_16X16 - TX_8X8, TX_32X32 - TX_8X8,
+ // 32X64, 64X32, 64X64
+ TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, TX_32X32 - TX_8X8,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, TX_32X32 - TX_8X8,
+#endif // CONFIG_EXT_PARTITION
+};
-// TODO(jingning): Temporary table during the construction.
-static const int tx_size_1d_in_unit_log2[TX_SIZES] = { 0, 0, 1, 2, 3 };
-
-static const BLOCK_SIZE txsize_to_bsize[TX_SIZES] = {
- BLOCK_4X4, // TODO(jingning): replace with BLOCK_2X2
- BLOCK_4X4, // TX_4X4
- BLOCK_8X8, // TX_8X8
- BLOCK_16X16, // TX_16X16
- BLOCK_32X32, // TX_32X32
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+// Same as "max_txsize_lookup[bsize] - TX_8X8", except for rectangular
+// block which may use a rectangular transform, in which case it is
+// "(max_txsize_lookup[bsize] + 1) - TX_8X8", invalid for bsize < 8X8
+static const int32_t inter_tx_size_cat_lookup[BLOCK_SIZES] = {
+ // 4X4
+ INT32_MIN,
+ // 4X8, 8X4, 8X8
+ INT32_MIN, INT32_MIN, TX_8X8 - TX_8X8,
+ // 8X16, 16X8, 16X16
+ TX_16X16 - TX_8X8, TX_16X16 - TX_8X8, TX_16X16 - TX_8X8,
+ // 16X32, 32X16, 32X32
+ TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, TX_32X32 - TX_8X8,
+ // 32X64, 64X32, 64X64
+ TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, TX_32X32 - TX_8X8,
+#if CONFIG_EXT_PARTITION
+ // 64x128, 128x64, 128x128
+ TX_32X32 - TX_8X8, TX_32X32 - TX_8X8, TX_32X32 - TX_8X8,
+#endif // CONFIG_EXT_PARTITION
};
#else
-static const int tx_size_1d[TX_SIZES] = { 4, 8, 16, 32 };
+#define inter_tx_size_cat_lookup intra_tx_size_cat_lookup
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
-static const int tx_size_2d[TX_SIZES] = { 16, 64, 256, 1024 };
+/* clang-format on */
+
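+// sub_tx_size_map gives the transform size used when a transform block is
+// split once; txsize_horz_map and txsize_vert_map give the square size
+// matching a transform's width and height respectively.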
+static const TX_SIZE sub_tx_size_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ TX_2X2, // TX_2X2
+#endif
+ TX_4X4, // TX_4X4
+ TX_4X4, // TX_8X8
+ TX_8X8, // TX_16X16
+ TX_16X16, // TX_32X32
+ TX_4X4, // TX_4X8
+ TX_4X4, // TX_8X4
+ TX_8X8, // TX_8X16
+ TX_8X8, // TX_16X8
+ TX_16X16, // TX_16X32
+ TX_16X16 // TX_32X16
+};
+
+static const TX_SIZE txsize_horz_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ TX_2X2, // TX_2X2
+#endif
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+ TX_4X4, // TX_4X8
+ TX_8X8, // TX_8X4
+ TX_8X8, // TX_8X16
+ TX_16X16, // TX_16X8
+ TX_16X16, // TX_16X32
+ TX_32X32 // TX_32X16
+};
+
+static const TX_SIZE txsize_vert_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ TX_2X2, // TX_2X2
+#endif
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+ TX_8X8, // TX_4X8
+ TX_4X4, // TX_8X4
+ TX_16X16, // TX_8X16
+ TX_8X8, // TX_16X8
+ TX_32X32, // TX_16X32
+ TX_16X16 // TX_32X16
+};
+
+// Transform block width in pixels
+static const int tx_size_wide[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 2,
+#endif
+ 4, 8, 16, 32, 4, 8, 8, 16, 16, 32,
+};
+
+// Transform block height in pixels
+static const int tx_size_high[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 2,
+#endif
+ 4, 8, 16, 32, 8, 4, 16, 8, 32, 16,
+};
+
+// Transform block width in units
+static const int tx_size_wide_unit[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 1,
+#endif
+ 1, 2, 4, 8, 1, 2, 2, 4, 4, 8,
+};
+
+// Transform block height in units
+static const int tx_size_high_unit[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 1,
+#endif
+ 1, 2, 4, 8, 2, 1, 4, 2, 8, 4,
+};
+
+// Log2 of transform block width
+static const int tx_size_wide_log2[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 2,
+#endif
+ 2, 3, 4, 5, 2, 3, 3, 4, 4, 5,
+};
+
+// Log2 of transform block height
+static const int tx_size_high_log2[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 2,
+#endif
+ 2, 3, 4, 5, 3, 2, 4, 3, 5, 4,
+};
+
+static const int tx_size_2d[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ 4,
+#endif
+ 16, 64, 256, 1024, 32, 32, 128, 128, 512, 512,
+};
static const uint8_t tx_size_1d_log2[TX_SIZES] = { 2, 3, 4, 5 };
-static const int tx_size_1d_in_unit[TX_SIZES] = { 1, 2, 4, 8 };
-
// TODO(jingning): Temporary table during the construction.
static const int tx_size_1d_in_unit_log2[TX_SIZES] = { 0, 1, 2, 3 };
-static const BLOCK_SIZE txsize_to_bsize[TX_SIZES] = {
+static const BLOCK_SIZE txsize_to_bsize[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ BLOCK_4X4, // TX_2X2
+#endif
BLOCK_4X4, // TX_4X4
BLOCK_8X8, // TX_8X8
BLOCK_16X16, // TX_16X16
BLOCK_32X32, // TX_32X32
+ BLOCK_4X8, // TX_4X8
+ BLOCK_8X4, // TX_8X4
+ BLOCK_8X16, // TX_8X16
+ BLOCK_16X8, // TX_16X8
+ BLOCK_16X32, // TX_16X32
+ BLOCK_32X16, // TX_32X16
};
-#endif
-static const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = {
- TX_4X4, TX_4X4, TX_4X4, TX_8X8, TX_8X8, TX_8X8, TX_16X16,
- TX_16X16, TX_16X16, TX_32X32, TX_32X32, TX_32X32, TX_32X32
+static const TX_SIZE txsize_sqr_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ TX_2X2, // TX_2X2
+#endif
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+ TX_4X4, // TX_4X8
+ TX_4X4, // TX_8X4
+ TX_8X8, // TX_8X16
+ TX_8X8, // TX_16X8
+ TX_16X16, // TX_16X32
+ TX_16X16, // TX_32X16
+};
+
+static const TX_SIZE txsize_sqr_up_map[TX_SIZES_ALL] = {
+#if CONFIG_CB4X4
+ TX_2X2, // TX_2X2
+#endif
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+ TX_8X8, // TX_4X8
+ TX_8X8, // TX_8X4
+ TX_16X16, // TX_8X16
+ TX_16X16, // TX_16X8
+ TX_32X32, // TX_16X32
+ TX_32X32, // TX_32X16
};
static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
@@ -180,6 +558,293 @@
{ { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } },
{ { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } },
{ { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } },
+#if CONFIG_EXT_PARTITION
+ { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } },
+ { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } },
+ { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } },
+#endif // CONFIG_EXT_PARTITION
+};
+
+#define USE_UV_RECT_TX 1
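+// Chroma (UV) transform size indexed by luma block size, luma transform
+// size and the chroma subsampling of the plane.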
+static const TX_SIZE uv_txsize_lookup[BLOCK_SIZES][TX_SIZES_ALL][2][2] = {
+ // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
+ // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
+ {
+// BLOCK_4X4
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ },
+ {
+// BLOCK_4X8
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#if CONFIG_RECT_TX && USE_UV_RECT_TX
+ { { TX_4X8, TX_4X4 }, { TX_4X4, TX_4X4 } }, // used
+#else
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, // used
+#endif // CONFIG_RECT_TX && USE_UV_RECT_TX
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_8X4
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+#if CONFIG_RECT_TX && USE_UV_RECT_TX
+ { { TX_8X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, // used
+#else
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } }, // used
+#endif // CONFIG_RECT_TX && USE_UV_RECT_TX
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_8X8
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X8, TX_4X4 }, { TX_4X8, TX_4X4 } },
+ { { TX_8X4, TX_8X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } },
+ { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } },
+ { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } },
+ { { TX_8X8, TX_8X4 }, { TX_4X8, TX_4X4 } },
+ },
+ {
+// BLOCK_8X16
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } },
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_4X4, TX_4X4 } },
+#if CONFIG_RECT_TX && USE_UV_RECT_TX
+ { { TX_8X16, TX_8X8 }, { TX_4X8, TX_4X8 } }, // used
+#else
+ { { TX_8X8, TX_8X8 }, { TX_4X4, TX_4X4 } }, // used
+#endif // CONFIG_RECT_TX && USE_UV_RECT_TX
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_16X8
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } },
+ { { TX_8X8, TX_4X4 }, { TX_8X8, TX_8X8 } },
+ { { TX_8X8, TX_4X4 }, { TX_8X8, TX_8X8 } },
+ { { TX_4X8, TX_4X4 }, { TX_4X8, TX_4X4 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+#if CONFIG_RECT_TX && USE_UV_RECT_TX
+ { { TX_16X8, TX_8X4 }, { TX_8X8, TX_8X4 } }, // used
+#else
+ { { TX_8X8, TX_4X4 }, { TX_8X8, TX_4X4 } }, // used
+#endif // CONFIG_RECT_TX && USE_UV_RECT_TX
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_16X16
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } },
+ { { TX_16X8, TX_16X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X8 }, { TX_8X16, TX_8X8 } },
+ { { TX_16X16, TX_16X8 }, { TX_8X16, TX_8X8 } },
+ },
+ {
+// BLOCK_16X32
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } },
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
+ { { TX_16X8, TX_16X8 }, { TX_8X8, TX_8X8 } },
+#if CONFIG_RECT_TX && USE_UV_RECT_TX
+ { { TX_16X32, TX_16X16 }, { TX_8X16, TX_8X16 } }, // used
+#else
+ { { TX_16X16, TX_16X16 }, { TX_8X8, TX_8X8 } }, // used
+#endif // CONFIG_RECT_TX && USE_UV_RECT_TX
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+ },
+ {
+// BLOCK_32X16
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } },
+ { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } },
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X8 }, { TX_8X16, TX_8X8 } },
+ { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
+ { { TX_INVALID, TX_INVALID }, { TX_INVALID, TX_INVALID } },
+#if CONFIG_RECT_TX && USE_UV_RECT_TX
+ { { TX_32X16, TX_16X8 }, { TX_16X16, TX_16X8 } }, // used
+#else
+ { { TX_16X16, TX_8X8 }, { TX_16X16, TX_8X8 } }, // used
+#endif // CONFIG_RECT_TX && USE_UV_RECT_TX
+ },
+ {
+// BLOCK_32X32
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
+ { { TX_32X32, TX_16X16 }, { TX_16X16, TX_16X16 } },
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
+ { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
+ { { TX_16X32, TX_16X16 }, { TX_16X32, TX_16X16 } },
+ { { TX_32X16, TX_32X16 }, { TX_16X16, TX_16X16 } },
+ },
+ {
+// BLOCK_32X64
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
+ { { TX_32X32, TX_32X32 }, { TX_16X16, TX_16X16 } },
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
+ { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
+ { { TX_16X32, TX_16X32 }, { TX_16X16, TX_16X16 } },
+ { { TX_32X16, TX_32X16 }, { TX_16X16, TX_16X16 } },
+ },
+ {
+// BLOCK_64X32
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
+ { { TX_32X32, TX_16X16 }, { TX_32X32, TX_16X16 } },
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
+ { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
+ { { TX_16X32, TX_16X16 }, { TX_16X32, TX_16X16 } },
+ { { TX_32X16, TX_16X16 }, { TX_32X16, TX_16X16 } },
+ },
+ {
+// BLOCK_64X64
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
+ { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } },
+#if CONFIG_EXT_PARTITION
+ },
+ {
+// BLOCK_64X128
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
+ { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } },
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
+ { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
+ { { TX_16X32, TX_16X32 }, { TX_16X32, TX_16X32 } },
+ { { TX_32X16, TX_32X16 }, { TX_32X16, TX_32X16 } },
+ },
+ {
+// BLOCK_128X64
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
+ { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } },
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
+ { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
+ { { TX_16X32, TX_16X32 }, { TX_16X32, TX_16X32 } },
+ { { TX_32X16, TX_32X16 }, { TX_32X16, TX_32X16 } },
+ },
+ {
+// BLOCK_128X128
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_16X16, TX_16X16 }, { TX_16X16, TX_16X16 } },
+ { { TX_32X32, TX_32X32 }, { TX_32X32, TX_32X32 } },
+ { { TX_4X8, TX_4X8 }, { TX_4X8, TX_4X8 } },
+ { { TX_8X4, TX_8X4 }, { TX_8X4, TX_8X4 } },
+ { { TX_8X16, TX_8X16 }, { TX_8X16, TX_8X16 } },
+ { { TX_16X8, TX_16X8 }, { TX_16X8, TX_16X8 } },
+ { { TX_16X32, TX_16X32 }, { TX_16X32, TX_16X32 } },
+ { { TX_32X16, TX_32X16 }, { TX_32X16, TX_32X16 } },
+#endif // CONFIG_EXT_PARTITION
+ },
};
// Generates 4 bit field in which each bit set to 1 represents
@@ -189,6 +854,24 @@
PARTITION_CONTEXT above;
PARTITION_CONTEXT left;
} partition_context_lookup[BLOCK_SIZES] = {
+#if CONFIG_EXT_PARTITION
+ { 31, 31 }, // 4X4 - {0b11111, 0b11111}
+ { 31, 30 }, // 4X8 - {0b11111, 0b11110}
+ { 30, 31 }, // 8X4 - {0b11110, 0b11111}
+ { 30, 30 }, // 8X8 - {0b11110, 0b11110}
+ { 30, 28 }, // 8X16 - {0b11110, 0b11100}
+ { 28, 30 }, // 16X8 - {0b11100, 0b11110}
+ { 28, 28 }, // 16X16 - {0b11100, 0b11100}
+ { 28, 24 }, // 16X32 - {0b11100, 0b11000}
+ { 24, 28 }, // 32X16 - {0b11000, 0b11100}
+ { 24, 24 }, // 32X32 - {0b11000, 0b11000}
+ { 24, 16 }, // 32X64 - {0b11000, 0b10000}
+ { 16, 24 }, // 64X32 - {0b10000, 0b11000}
+ { 16, 16 }, // 64X64 - {0b10000, 0b10000}
+ { 16, 0 }, // 64X128- {0b10000, 0b00000}
+ { 0, 16 }, // 128X64- {0b00000, 0b10000}
+ { 0, 0 }, // 128X128-{0b00000, 0b00000}
+#else
{ 15, 15 }, // 4X4 - {0b1111, 0b1111}
{ 15, 14 }, // 4X8 - {0b1111, 0b1110}
{ 14, 15 }, // 8X4 - {0b1110, 0b1111}
@@ -202,8 +885,33 @@
{ 8, 0 }, // 32X64 - {0b1000, 0b0000}
{ 0, 8 }, // 64X32 - {0b0000, 0b1000}
{ 0, 0 }, // 64X64 - {0b0000, 0b0000}
+#endif // CONFIG_EXT_PARTITION
};
+#if CONFIG_SUPERTX
+static const TX_SIZE uvsupertx_size_lookup[TX_SIZES][2][2] = {
+// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
+// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
+#if CONFIG_CB4X4
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+#endif
+ { { TX_4X4, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_8X8, TX_4X4 }, { TX_4X4, TX_4X4 } },
+ { { TX_16X16, TX_8X8 }, { TX_8X8, TX_8X8 } },
+ { { TX_32X32, TX_16X16 }, { TX_16X16, TX_16X16 } },
+};
+
+#if CONFIG_EXT_PARTITION_TYPES
+static const int partition_supertx_context_lookup[EXT_PARTITION_TYPES] = {
+ -1, 0, 0, 1, 0, 0, 0, 0
+};
+
+#else
+static const int partition_supertx_context_lookup[PARTITION_TYPES] = { -1, 0, 0,
+ 1 };
+#endif // CONFIG_EXT_PARTITION_TYPES
+#endif // CONFIG_SUPERTX
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 90bfcca..eef629e 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -13,8 +13,8 @@
#include <string.h>
#include "./av1_rtcd.h"
+#include "av1/common/convolve.h"
#include "av1/common/filter.h"
-#include "av1/common/enums.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_ports/mem.h"
@@ -34,8 +34,8 @@
int x_q4 = subpel_x_q4;
for (x = 0; x < w; ++x) {
const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *x_filter =
- get_interp_filter_subpel_kernel(filter_params, x_q4 & SUBPEL_MASK);
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, x_q4 & SUBPEL_MASK);
int k, sum = 0;
for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
if (avg) {
@@ -63,8 +63,8 @@
int y_q4 = subpel_y_q4;
for (y = 0; y < h; ++y) {
const uint8_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *y_filter =
- get_interp_filter_subpel_kernel(filter_params, y_q4 & SUBPEL_MASK);
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, y_q4 & SUBPEL_MASK);
int k, sum = 0;
for (k = 0; k < filter_size; ++k)
sum += src_y[k * src_stride] * y_filter[k];
@@ -106,9 +106,13 @@
void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- const InterpFilter *interp_filter, const int subpel_x_q4,
- int x_step_q4, const int subpel_y_q4, int y_step_q4,
- int ref_idx) {
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ const int subpel_x_q4, int x_step_q4, const int subpel_y_q4,
+ int y_step_q4, int ref_idx) {
int ignore_horiz = x_step_q4 == 16 && subpel_x_q4 == 0;
int ignore_vert = y_step_q4 == 16 && subpel_y_q4 == 0;
@@ -120,12 +124,24 @@
if (ignore_horiz && ignore_vert) {
convolve_copy(src, src_stride, dst, dst_stride, w, h, ref_idx);
} else if (ignore_vert) {
- InterpFilterParams filter_params = get_interp_filter_params(*interp_filter);
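+    // With CONFIG_DUAL_FILTER, interp_filter carries a vertical/horizontal
+    // filter pair per reference frame: index 2 * ref_idx selects the
+    // vertical filter and 1 + 2 * ref_idx the horizontal one.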
+#if CONFIG_DUAL_FILTER
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
+#else
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter);
+#endif
assert(filter_params.taps <= MAX_FILTER_TAP);
av1_convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params,
subpel_x_q4, x_step_q4, ref_idx);
} else if (ignore_horiz) {
- InterpFilterParams filter_params = get_interp_filter_params(*interp_filter);
+#if CONFIG_DUAL_FILTER
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter[2 * ref_idx]);
+#else
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter);
+#endif
assert(filter_params.taps <= MAX_FILTER_TAP);
av1_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
subpel_y_q4, y_step_q4, ref_idx);
@@ -136,17 +152,39 @@
MAX_FILTER_TAP) *
MAX_BLOCK_WIDTH];
int temp_stride = MAX_BLOCK_WIDTH;
- InterpFilterParams filter_params = get_interp_filter_params(*interp_filter);
+#if CONFIG_DUAL_FILTER
+ InterpFilterParams filter_params_x =
+ av1_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
+ InterpFilterParams filter_params_y =
+ av1_get_interp_filter_params(interp_filter[0 + 2 * ref_idx]);
+ InterpFilterParams filter_params = filter_params_x;
+
+ // The filter size implies the required number of reference pixels for
+ // the second stage filtering. It is possible that the two directions
+ // require different filter sizes.
+ int filter_size = filter_params_y.taps;
+#else
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter);
int filter_size = filter_params.taps;
+#endif
int intermediate_height =
(((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
- assert(filter_size <= MAX_FILTER_TAP);
+ assert(filter_params.taps <= MAX_FILTER_TAP);
av1_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride,
temp, temp_stride, w, intermediate_height, filter_params,
subpel_x_q4, x_step_q4, 0);
+#if CONFIG_DUAL_FILTER
+ filter_params = filter_params_y;
+#else
+ filter_params = av1_get_interp_filter_params(interp_filter);
+#endif
+ filter_size = filter_params.taps;
+ assert(filter_params.taps <= MAX_FILTER_TAP);
+
av1_convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride,
dst, dst_stride, w, h, filter_params, subpel_y_q4,
y_step_q4, ref_idx);
@@ -166,8 +204,8 @@
int x_q4 = subpel_x_q4;
for (x = 0; x < w; ++x) {
const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *x_filter =
- get_interp_filter_subpel_kernel(filter_params, x_q4 & SUBPEL_MASK);
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, x_q4 & SUBPEL_MASK);
int k, sum = 0;
for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
if (avg)
@@ -197,8 +235,8 @@
int y_q4 = subpel_y_q4;
for (y = 0; y < h; ++y) {
const uint16_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *y_filter =
- get_interp_filter_subpel_kernel(filter_params, y_q4 & SUBPEL_MASK);
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, y_q4 & SUBPEL_MASK);
int k, sum = 0;
for (k = 0; k < filter_size; ++k)
sum += src_y[k * src_stride] * y_filter[k];
@@ -242,7 +280,11 @@
void av1_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
int dst_stride, int w, int h,
+#if CONFIG_DUAL_FILTER
const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
const int subpel_x_q4, int x_step_q4,
const int subpel_y_q4, int y_step_q4, int ref_idx,
int bd) {
@@ -259,12 +301,24 @@
if (ignore_horiz && ignore_vert) {
highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h, ref_idx, bd);
} else if (ignore_vert) {
- InterpFilterParams filter_params = get_interp_filter_params(*interp_filter);
+#if CONFIG_DUAL_FILTER
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
+#else
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter);
+#endif
av1_highbd_convolve_horiz(src, src_stride, dst, dst_stride, w, h,
filter_params, subpel_x_q4, x_step_q4, ref_idx,
bd);
} else if (ignore_horiz) {
- InterpFilterParams filter_params = get_interp_filter_params(*interp_filter);
+#if CONFIG_DUAL_FILTER
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter[0 + 2 * ref_idx]);
+#else
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter);
+#endif
av1_highbd_convolve_vert(src, src_stride, dst, dst_stride, w, h,
filter_params, subpel_y_q4, y_step_q4, ref_idx,
bd);
@@ -276,16 +330,32 @@
MAX_BLOCK_WIDTH];
int temp_stride = MAX_BLOCK_WIDTH;
- InterpFilterParams filter_params = get_interp_filter_params(*interp_filter);
+#if CONFIG_DUAL_FILTER
+ InterpFilterParams filter_params_x =
+ av1_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
+ InterpFilterParams filter_params_y =
+ av1_get_interp_filter_params(interp_filter[0 + 2 * ref_idx]);
+ InterpFilterParams filter_params = filter_params_x;
+ int filter_size = filter_params_y.taps;
+#else
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter);
int filter_size = filter_params.taps;
+#endif
+
int intermediate_height =
(((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
- assert(filter_size <= MAX_FILTER_TAP);
av1_highbd_convolve_horiz(
src - src_stride * (filter_size / 2 - 1), src_stride, temp, temp_stride,
w, intermediate_height, filter_params, subpel_x_q4, x_step_q4, 0, bd);
+#if CONFIG_DUAL_FILTER
+ filter_params = filter_params_y;
+#endif
+ filter_size = filter_params.taps;
+ assert(filter_params.taps <= MAX_FILTER_TAP);
+
av1_highbd_convolve_vert(temp + temp_stride * (filter_size / 2 - 1),
temp_stride, dst, dst_stride, w, h, filter_params,
subpel_y_q4, y_step_q4, ref_idx, bd);
diff --git a/av1/common/convolve.h b/av1/common/convolve.h
index 0b27a3a..dafa032 100644
--- a/av1/common/convolve.h
+++ b/av1/common/convolve.h
@@ -9,8 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#ifndef AV1_COMMON_CONVOLVE_H_
-#define AV1_COMMON_CONVOLVE_H_
+#ifndef AV1_COMMON_AV1_CONVOLVE_H_
+#define AV1_COMMON_AV1_CONVOLVE_H_
#include "av1/common/filter.h"
#ifdef __cplusplus
@@ -19,19 +19,28 @@
void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- const InterpFilter *interp_filter, const int subpel_x,
- const int subpel_y, int xstep, int ystep, int avg);
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ const int subpel_x, int xstep, const int subpel_y, int ystep,
+ int avg);
#if CONFIG_AOM_HIGHBITDEPTH
void av1_highbd_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, int w, int h,
- const InterpFilter *interp_filter, const int subpel_x,
- const int subpel_y, int xstep, int ystep, int avg,
- int bd);
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ const int subpel_x, int xstep, const int subpel_y,
+ int ystep, int avg, int bd);
#endif // CONFIG_AOM_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_COMMON_AOM_CONVOLVE_H_
+#endif // AV1_COMMON_AV1_CONVOLVE_H_
diff --git a/av1/common/entropy.c b/av1/common/entropy.c
index 16ade81..95368c6 100644
--- a/av1/common/entropy.c
+++ b/av1/common/entropy.c
@@ -19,24 +19,18 @@
#include "aom/aom_integer.h"
// Unconstrained Node Tree
+/* clang-format off */
const aom_tree_index av1_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
- 2,
- 6, // 0 = LOW_VAL
- -TWO_TOKEN,
- 4, // 1 = TWO
- -THREE_TOKEN,
- -FOUR_TOKEN, // 2 = THREE
- 8,
- 10, // 3 = HIGH_LOW
- -CATEGORY1_TOKEN,
- -CATEGORY2_TOKEN, // 4 = CAT_ONE
- 12,
- 14, // 5 = CAT_THREEFOUR
- -CATEGORY3_TOKEN,
- -CATEGORY4_TOKEN, // 6 = CAT_THREE
- -CATEGORY5_TOKEN,
- -CATEGORY6_TOKEN // 7 = CAT_FIVE
+ 2, 6, // 0 = LOW_VAL
+ -TWO_TOKEN, 4, // 1 = TWO
+ -THREE_TOKEN, -FOUR_TOKEN, // 2 = THREE
+ 8, 10, // 3 = HIGH_LOW
+ -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 4 = CAT_ONE
+ 12, 14, // 5 = CAT_THREEFOUR
+ -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 6 = CAT_THREE
+ -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE
};
+/* clang-format on */
const aom_prob av1_cat1_prob[] = { 159 };
const aom_prob av1_cat2_prob[] = { 165, 145 };
@@ -64,6 +58,28 @@
177, 153, 140, 133, 130, 129 };
#endif
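+// Number of coefficient positions in each coefficient band for every
+// transform size; the cumulative table gives the first position of each
+// band.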
+const uint16_t band_count_table[TX_SIZES_ALL][8] = {
+#if CONFIG_CB4X4
+ { 1, 2, 2, 3, 0, 0, 0 },
+#endif
+ { 1, 2, 3, 4, 3, 16 - 13, 0 }, { 1, 2, 3, 4, 11, 64 - 21, 0 },
+ { 1, 2, 3, 4, 11, 256 - 21, 0 }, { 1, 2, 3, 4, 11, 1024 - 21, 0 },
+ { 1, 2, 3, 4, 8, 32 - 18, 0 }, { 1, 2, 3, 4, 8, 32 - 18, 0 },
+ { 1, 2, 3, 4, 11, 128 - 21, 0 }, { 1, 2, 3, 4, 11, 128 - 21, 0 },
+ { 1, 2, 3, 4, 11, 512 - 21, 0 }, { 1, 2, 3, 4, 11, 512 - 21, 0 },
+};
+
+const uint16_t band_cum_count_table[TX_SIZES_ALL][8] = {
+#if CONFIG_CB4X4
+ { 0, 1, 3, 6, 10, 13, 16, 0 },
+#endif
+ { 0, 1, 3, 6, 10, 13, 16, 0 }, { 0, 1, 3, 6, 10, 21, 64, 0 },
+ { 0, 1, 3, 6, 10, 21, 256, 0 }, { 0, 1, 3, 6, 10, 21, 1024, 0 },
+ { 0, 1, 3, 6, 10, 18, 32, 0 }, { 0, 1, 3, 6, 10, 18, 32, 0 },
+ { 0, 1, 3, 6, 10, 21, 128, 0 }, { 0, 1, 3, 6, 10, 21, 128, 0 },
+ { 0, 1, 3, 6, 10, 21, 512, 0 }, { 0, 1, 3, 6, 10, 21, 512, 0 },
+};
+
const uint8_t av1_coefband_trans_8x8plus[1024] = {
0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5,
// beyond MAXBAND_INDEX+1 all values are filled as 5
@@ -108,6 +124,13 @@
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
};
+#if CONFIG_EXT_TX
+const uint8_t av1_coefband_trans_4x8_8x4[32] = {
+ 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
+ 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+};
+#endif // CONFIG_EXT_TX
+
const uint8_t av1_coefband_trans_4x4[16] = {
0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,
};
@@ -115,7 +138,7 @@
const uint8_t av1_pt_energy_class[ENTROPY_TOKENS] = { 0, 1, 2, 3, 3, 4,
4, 5, 5, 5, 5, 5 };
-// Model obtained from a 2-sided zero-centerd distribuition derived
+// Model obtained from a 2-sided zero-centered distribution derived
// from a Pareto distribution. The cdf of the distribution is:
// cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
//
@@ -386,6 +409,7 @@
{ 255, 243, 245, 255, 237, 255, 252, 254 },
{ 255, 246, 247, 255, 239, 255, 253, 255 },
};
+
#if CONFIG_EC_MULTISYMBOL
// Model obtained from a 2-sided zero-centered distribution derived
// from a Pareto distribution. The cdf of the distribution is:
@@ -660,671 +684,2121 @@
};
#endif // CONFIG_EC_MULTISYMBOL
+/* clang-format off */
+#if CONFIG_ENTROPY
+const av1_coeff_probs_model
+default_qctx_coef_probs[QCTX_BINS][TX_SIZES][PLANE_TYPES] = {
+ { // Q_Index 0
+ { // TX_SIZE 0
+ { // Y plane
+ { // Intra
+ { // band 0
+ {182, 34, 137}, { 79, 39, 103}, { 10, 28, 51},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 45, 88, 147}, { 46, 80, 140}, { 25, 69, 119},
+ { 12, 57, 96}, { 4, 41, 65}, { 1, 20, 31},
+ },
+ { // band 2
+ { 58, 124, 190}, { 39, 106, 178}, { 16, 86, 147},
+ { 7, 69, 114}, { 3, 50, 80}, { 1, 25, 42},
+ },
+ { // band 3
+ { 90, 138, 215}, { 54, 116, 198}, { 18, 86, 155},
+ { 5, 62, 112}, { 1, 38, 68}, { 1, 17, 30},
+ },
+ { // band 4
+ {126, 149, 231}, { 82, 114, 211}, { 21, 80, 157},
+ { 6, 56, 105}, { 1, 36, 64}, { 1, 17, 31},
+ },
+ { // band 5
+ {171, 56, 236}, {140, 54, 219}, { 57, 45, 167},
+ { 26, 36, 113}, { 11, 29, 72}, { 3, 18, 39},
+ },
+ },
+ { // Intra
+ { // band 0
+ {153, 122, 186}, {106, 109, 171}, { 36, 84, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 27, 151, 201}, { 34, 131, 199}, { 23, 102, 161},
+ { 10, 80, 120}, { 4, 52, 78}, { 1, 24, 37},
+ },
+ { // band 2
+ { 43, 158, 213}, { 35, 133, 203}, { 8, 92, 151},
+ { 2, 64, 106}, { 1, 36, 60}, { 1, 13, 24},
+ },
+ { // band 3
+ { 68, 167, 223}, { 36, 135, 211}, { 9, 94, 157},
+ { 2, 67, 112}, { 1, 40, 68}, { 1, 17, 31},
+ },
+ { // band 4
+ {131, 146, 237}, { 72, 119, 223}, { 17, 82, 164},
+ { 4, 55, 107}, { 1, 34, 63}, { 1, 16, 29},
+ },
+ { // band 5
+ {184, 68, 244}, {153, 59, 232}, { 68, 51, 179},
+ { 31, 40, 123}, { 13, 29, 77}, { 4, 17, 37},
+ },
+ },
+ },
+ { // UV plane
+ { // Inter
+ { // band 0
+ {203, 41, 203}, {127, 56, 174}, { 49, 56, 127},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {110, 121, 217}, {119, 113, 213}, { 64, 95, 185},
+ { 30, 72, 144}, { 8, 42, 76}, { 2, 17, 25},
+ },
+ { // band 2
+ {127, 159, 229}, {115, 134, 223}, { 36, 100, 189},
+ { 11, 75, 142}, { 3, 48, 83}, { 1, 19, 33},
+ },
+ { // band 3
+ {150, 172, 241}, { 90, 133, 231}, { 28, 102, 192},
+ { 7, 81, 147}, { 1, 53, 91}, { 1, 25, 42},
+ },
+ { // band 4
+ {184, 144, 248}, {114, 117, 237}, { 37, 89, 192},
+ { 10, 63, 130}, { 4, 42, 76}, { 1, 19, 38},
+ },
+ { // band 5
+ {207, 79, 250}, {179, 74, 241}, { 83, 67, 199},
+ { 38, 51, 142}, { 17, 37, 97}, { 10, 14, 55},
+ },
+ },
+ { // Inter
+ { // band 0
+ {220, 82, 232}, {150, 93, 214}, { 66, 95, 177},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {116, 160, 227}, {136, 141, 227}, { 67, 114, 190},
+ { 40, 94, 148}, { 21, 70, 107}, { 10, 43, 63},
+ },
+ { // band 2
+ {124, 173, 235}, {105, 147, 226}, { 27, 107, 184},
+ { 10, 80, 142}, { 3, 50, 86}, { 1, 16, 32},
+ },
+ { // band 3
+ {149, 179, 243}, { 89, 147, 234}, { 29, 112, 193},
+ { 9, 94, 157}, { 1, 64, 111}, { 1, 25, 43},
+ },
+ { // band 4
+ {187, 153, 248}, {127, 130, 241}, { 52, 99, 202},
+ { 20, 79, 152}, { 4, 50, 93}, { 1, 19, 32},
+ },
+ { // band 5
+ {215, 82, 251}, {195, 80, 246}, { 93, 70, 204},
+ { 39, 54, 147}, { 14, 33, 88}, { 6, 14, 39},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 1
+ { // Y plane
+ { // Intra
+ { // band 0
+ {116, 43, 131}, { 39, 41, 94}, { 4, 28, 47},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 28, 101, 141}, { 27, 95, 140}, { 18, 80, 121},
+ { 10, 61, 95}, { 4, 39, 60}, { 1, 19, 26},
+ },
+ { // band 2
+ { 29, 150, 183}, { 19, 127, 175}, { 8, 98, 147},
+ { 3, 76, 115}, { 1, 55, 84}, { 1, 29, 43},
+ },
+ { // band 3
+ { 26, 168, 202}, { 12, 138, 188}, { 2, 98, 149},
+ { 1, 69, 110}, { 1, 40, 65}, { 1, 17, 25},
+ },
+ { // band 4
+ { 33, 188, 225}, { 12, 155, 207}, { 2, 101, 155},
+ { 1, 65, 106}, { 1, 36, 60}, { 1, 18, 26},
+ },
+ { // band 5
+ { 79, 205, 242}, { 30, 168, 224}, { 5, 106, 164},
+ { 1, 68, 110}, { 1, 39, 65}, { 1, 18, 28},
+ },
+ },
+ { // Intra
+ { // band 0
+ { 96, 80, 201}, { 51, 88, 168}, { 14, 78, 116},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 6, 167, 216}, { 32, 152, 211}, { 24, 121, 182},
+ { 13, 98, 149}, { 12, 76, 108}, { 8, 48, 62},
+ },
+ { // band 2
+ { 17, 176, 225}, { 13, 147, 209}, { 3, 96, 155},
+ { 1, 65, 108}, { 2, 43, 63}, { 2, 23, 25},
+ },
+ { // band 3
+ { 18, 183, 232}, { 10, 153, 214}, { 1, 96, 154},
+ { 1, 63, 105}, { 1, 39, 59}, { 1, 21, 24},
+ },
+ { // band 4
+ { 23, 191, 239}, { 8, 159, 221}, { 1, 97, 158},
+ { 1, 61, 105}, { 1, 37, 60}, { 1, 20, 26},
+ },
+ { // band 5
+ { 70, 201, 243}, { 29, 163, 228}, { 4, 102, 169},
+ { 1, 67, 114}, { 1, 39, 66}, { 1, 17, 29},
+ },
+ },
+ },
+ { // UV plane
+ { // Inter
+ { // band 0
+ {181, 38, 192}, { 95, 47, 151}, { 29, 49, 102},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 72, 131, 202}, { 93, 120, 205}, { 50, 103, 179},
+ { 24, 79, 143}, { 11, 47, 78}, { 7, 19, 25},
+ },
+ { // band 2
+ { 84, 176, 221}, { 56, 144, 214}, { 21, 108, 182},
+ { 8, 83, 139}, { 3, 55, 90}, { 2, 27, 41},
+ },
+ { // band 3
+ { 84, 195, 234}, { 42, 156, 222}, { 10, 109, 180},
+ { 4, 77, 133}, { 1, 48, 80}, { 1, 23, 35},
+ },
+ { // band 4
+ { 89, 210, 238}, { 35, 165, 221}, { 6, 106, 172},
+ { 2, 70, 123}, { 1, 44, 74}, { 1, 21, 30},
+ },
+ { // band 5
+ {114, 221, 247}, { 49, 170, 234}, { 7, 113, 184},
+ { 2, 77, 132}, { 1, 48, 79}, { 1, 25, 33},
+ },
+ },
+ { // Inter
+ { // band 0
+ {192, 66, 237}, {113, 84, 211}, { 35, 84, 154},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 81, 180, 234}, {127, 165, 229}, { 58, 137, 204},
+ { 41, 114, 174}, { 44, 94, 136}, { 29, 66, 86},
+ },
+ { // band 2
+ { 82, 193, 240}, { 39, 162, 223}, { 8, 113, 179},
+ { 3, 83, 136}, { 6, 62, 84}, { 5, 45, 45},
+ },
+ { // band 3
+ { 78, 203, 242}, { 31, 170, 227}, { 4, 115, 181},
+ { 1, 82, 135}, { 2, 59, 82}, { 1, 45, 47},
+ },
+ { // band 4
+ { 76, 210, 239}, { 25, 170, 213}, { 2, 99, 152},
+ { 1, 69, 115}, { 1, 49, 80}, { 1, 47, 57},
+ },
+ { // band 5
+ {103, 217, 250}, { 42, 180, 237}, { 3, 124, 191},
+ { 1, 90, 150}, { 1, 69, 116}, { 1, 52, 46},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 2
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 58, 38, 99}, { 9, 26, 51}, { 1, 14, 22},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 14, 78, 109}, { 16, 73, 105}, { 11, 62, 92},
+ { 6, 47, 72}, { 2, 29, 45}, { 1, 12, 18},
+ },
+ { // band 2
+ { 17, 131, 148}, { 11, 112, 140}, { 5, 87, 118},
+ { 2, 63, 90}, { 1, 42, 63}, { 1, 19, 31},
+ },
+ { // band 3
+ { 12, 151, 168}, { 6, 116, 152}, { 1, 76, 115},
+ { 1, 50, 81}, { 1, 32, 52}, { 1, 14, 23},
+ },
+ { // band 4
+ { 10, 174, 191}, { 3, 130, 172}, { 1, 80, 126},
+ { 1, 53, 88}, { 1, 32, 55}, { 1, 14, 24},
+ },
+ { // band 5
+ { 19, 219, 237}, { 3, 168, 211}, { 1, 90, 142},
+ { 1, 53, 91}, { 1, 29, 51}, { 1, 12, 21},
+ },
+ },
+ { // Inter
+ { // band 0
+ { 21, 46, 184}, { 10, 53, 130}, { 2, 49, 78},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 3, 169, 198}, { 37, 165, 196}, { 26, 134, 176},
+ { 11, 108, 149}, { 5, 81, 112}, { 3, 47, 64},
+ },
+ { // band 2
+ { 11, 183, 215}, { 8, 142, 192}, { 2, 91, 141},
+ { 1, 62, 100}, { 1, 38, 62}, { 1, 17, 28},
+ },
+ { // band 3
+ { 12, 190, 223}, { 6, 149, 199}, { 1, 88, 139},
+ { 1, 56, 93}, { 1, 31, 54}, { 1, 13, 21},
+ },
+ { // band 4
+ { 11, 197, 230}, { 3, 154, 204}, { 1, 83, 134},
+ { 1, 50, 86}, { 1, 28, 49}, { 1, 12, 21},
+ },
+ { // band 5
+ { 17, 211, 240}, { 2, 167, 217}, { 1, 88, 143},
+ { 1, 53, 91}, { 1, 30, 53}, { 1, 14, 24},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {151, 30, 151}, { 50, 36, 105}, { 8, 34, 66},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 39, 111, 160}, { 62, 111, 165}, { 37, 99, 147},
+ { 15, 77, 118}, { 3, 47, 73}, { 1, 17, 27},
+ },
+ { // band 2
+ { 48, 170, 190}, { 32, 135, 180}, { 11, 100, 149},
+ { 4, 76, 116}, { 1, 51, 80}, { 1, 22, 36},
+ },
+ { // band 3
+ { 39, 191, 208}, { 18, 141, 191}, { 3, 96, 150},
+ { 1, 66, 110}, { 1, 41, 69}, { 1, 17, 28},
+ },
+ { // band 4
+ { 32, 209, 219}, { 8, 152, 201}, { 1, 96, 153},
+ { 1, 63, 106}, { 1, 38, 66}, { 1, 17, 29},
+ },
+ { // band 5
+ { 33, 230, 237}, { 5, 173, 214}, { 1, 100, 155},
+ { 1, 62, 105}, { 1, 38, 66}, { 1, 18, 32},
+ },
+ },
+ { // Inter
+ { // band 0
+ {149, 38, 231}, { 59, 51, 186}, { 12, 54, 117},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 53, 179, 226}, {126, 176, 223}, { 58, 147, 202},
+ { 28, 118, 174}, { 15, 94, 138}, { 14, 63, 87},
+ },
+ { // band 2
+ { 58, 196, 232}, { 26, 158, 213}, { 5, 106, 166},
+ { 1, 75, 124}, { 1, 46, 79}, { 1, 23, 39},
+ },
+ { // band 3
+ { 46, 203, 235}, { 17, 162, 213}, { 2, 104, 165},
+ { 1, 72, 120}, { 1, 44, 74}, { 1, 22, 33},
+ },
+ { // band 4
+ { 37, 213, 238}, { 8, 167, 216}, { 1, 104, 168},
+ { 1, 68, 119}, { 1, 40, 67}, { 1, 17, 29},
+ },
+ { // band 5
+ { 30, 228, 239}, { 4, 181, 213}, { 1, 103, 153},
+ { 1, 65, 110}, { 1, 43, 79}, { 1, 27, 56},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 3
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 76, 25, 53}, { 9, 18, 32}, { 1, 12, 18},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 29, 55, 91}, { 19, 58, 95}, { 15, 57, 89},
+ { 12, 49, 77}, { 3, 29, 44}, { 1, 8, 12},
+ },
+ { // band 2
+ { 32, 160, 148}, { 33, 143, 146}, { 19, 122, 132},
+ { 6, 90, 102}, { 1, 58, 70}, { 1, 17, 24},
+ },
+ { // band 3
+ { 16, 181, 181}, { 6, 142, 165}, { 1, 90, 120},
+ { 1, 50, 71}, { 1, 25, 38}, { 1, 9, 14},
+ },
+ { // band 4
+ { 13, 203, 203}, { 3, 154, 176}, { 1, 80, 108},
+ { 1, 41, 61}, { 1, 24, 37}, { 1, 11, 17},
+ },
+ { // band 5
+ { 6, 234, 240}, { 1, 178, 204}, { 1, 80, 119},
+ { 1, 45, 71}, { 1, 26, 42}, { 1, 12, 19},
+ },
+ },
+ { // Inter
+ { // band 0
+ { 78, 20, 135}, { 25, 18, 101}, { 5, 19, 57},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 7, 144, 183}, {117, 151, 195}, {109, 151, 187},
+ { 39, 130, 168}, { 11, 100, 125}, { 4, 59, 64},
+ },
+ { // band 2
+ { 20, 184, 212}, { 12, 148, 191}, { 2, 98, 141},
+ { 1, 65, 100}, { 1, 39, 61}, { 1, 14, 22},
+ },
+ { // band 3
+ { 15, 194, 222}, { 6, 153, 198}, { 1, 92, 138},
+ { 1, 58, 91}, { 1, 32, 52}, { 1, 12, 18},
+ },
+ { // band 4
+ { 14, 206, 232}, { 3, 162, 206}, { 1, 89, 134},
+ { 1, 52, 83}, { 1, 28, 46}, { 1, 11, 17},
+ },
+ { // band 5
+ { 6, 225, 241}, { 1, 175, 210}, { 1, 81, 125},
+ { 1, 48, 78}, { 1, 28, 46}, { 1, 13, 21},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {124, 23, 93}, { 31, 24, 63}, { 6, 24, 46},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 23, 86, 126}, { 45, 90, 145}, { 31, 91, 133},
+ { 19, 80, 114}, { 7, 53, 72}, { 1, 20, 27},
+ },
+ { // band 2
+ { 51, 186, 189}, { 48, 159, 182}, { 33, 128, 156},
+ { 15, 92, 124}, { 2, 62, 83}, { 1, 29, 43},
+ },
+ { // band 3
+ { 36, 198, 211}, { 15, 156, 187}, { 3, 97, 137},
+ { 1, 61, 93}, { 1, 35, 57}, { 1, 15, 23},
+ },
+ { // band 4
+ { 34, 219, 223}, { 9, 162, 193}, { 1, 91, 136},
+ { 1, 58, 92}, { 1, 35, 54}, { 1, 14, 23},
+ },
+ { // band 5
+ { 19, 243, 243}, { 3, 191, 208}, { 1, 91, 137},
+ { 1, 56, 90}, { 1, 34, 55}, { 1, 16, 24},
+ },
+ },
+ { // Inter
+ { // band 0
+ {119, 20, 197}, { 19, 29, 156}, { 3, 30, 107},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 24, 192, 226}, {161, 193, 227}, { 97, 185, 222},
+ { 31, 158, 204}, { 16, 122, 165}, { 17, 84, 112},
+ },
+ { // band 2
+ { 26, 202, 229}, { 11, 165, 210}, { 2, 103, 152},
+ { 1, 68, 104}, { 1, 42, 70}, { 1, 16, 36},
+ },
+ { // band 3
+ { 24, 209, 237}, { 6, 169, 214}, { 1, 102, 154},
+ { 1, 65, 107}, { 1, 45, 68}, { 1, 17, 24},
+ },
+ { // band 4
+ { 19, 219, 243}, { 4, 183, 226}, { 1, 115, 172},
+ { 1, 73, 119}, { 1, 43, 77}, { 1, 15, 37},
+ },
+ { // band 5
+ { 11, 237, 241}, { 2, 190, 216}, { 1, 108, 146},
+ { 1, 59, 94}, { 1, 40, 67}, { 1, 30, 53},
+ },
+ },
+ },
+ },
+ },
+ { // Q_Index 1
+ { // TX_SIZE 0
+ { // Y plane
+ { // Intra
+ { // band 0
+ {174, 30, 159}, { 76, 38, 115}, { 15, 33, 65},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 60, 80, 153}, { 72, 75, 147}, { 36, 68, 129},
+ { 15, 59, 104}, { 4, 45, 74}, { 1, 28, 45},
+ },
+ { // band 2
+ { 70, 122, 186}, { 55, 104, 175}, { 21, 83, 144},
+ { 8, 67, 112}, { 2, 51, 82}, { 1, 34, 57},
+ },
+ { // band 3
+ { 97, 144, 207}, { 52, 109, 195}, { 16, 77, 153},
+ { 4, 58, 113}, { 1, 43, 77}, { 1, 27, 48},
+ },
+ { // band 4
+ {128, 148, 229}, { 76, 104, 210}, { 18, 77, 159},
+ { 4, 65, 110}, { 1, 52, 82}, { 1, 31, 55},
+ },
+ { // band 5
+ {165, 51, 238}, {128, 50, 230}, { 57, 49, 185},
+ { 28, 47, 130}, { 12, 44, 96}, { 3, 36, 60},
+ },
+ },
+ { // Inter
+ { // band 0
+ {169, 103, 203}, {117, 96, 176}, { 56, 81, 137},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 31, 150, 224}, { 49, 128, 212}, { 19, 92, 165},
+ { 6, 67, 116}, { 2, 43, 71}, { 1, 21, 36},
+ },
+ { // band 2
+ { 58, 156, 230}, { 47, 130, 215}, { 7, 87, 158},
+ { 2, 63, 114}, { 1, 39, 71}, { 1, 18, 36},
+ },
+ { // band 3
+ { 85, 176, 240}, { 43, 138, 226}, { 8, 93, 172},
+ { 2, 70, 127}, { 1, 46, 81}, { 1, 26, 47},
+ },
+ { // band 4
+ {155, 144, 248}, { 93, 116, 235}, { 21, 83, 180},
+ { 4, 59, 119}, { 1, 43, 80}, { 1, 25, 50},
+ },
+ { // band 5
+ {203, 61, 250}, {171, 57, 243}, { 71, 57, 199},
+ { 31, 49, 144}, { 13, 42, 96}, { 7, 30, 52},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {204, 44, 204}, {137, 57, 184}, { 72, 62, 152},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {145, 117, 236}, {151, 112, 231}, { 87, 95, 208},
+ { 31, 77, 165}, { 5, 49, 98}, { 1, 24, 39},
+ },
+ { // band 2
+ {146, 152, 241}, {140, 132, 236}, { 41, 103, 209},
+ { 10, 86, 165}, { 2, 55, 106}, { 1, 25, 58},
+ },
+ { // band 3
+ {154, 181, 249}, { 84, 143, 240}, { 23, 114, 210},
+ { 6, 102, 182}, { 2, 71, 137}, { 1, 35, 90},
+ },
+ { // band 4
+ {184, 150, 251}, {115, 130, 244}, { 34, 105, 215},
+ { 15, 89, 173}, { 1, 51, 141}, {128, 128, 128},
+ },
+ { // band 5
+ {211, 71, 253}, {193, 78, 249}, {106, 91, 232},
+ { 61, 87, 198}, { 85, 153, 254}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {232, 104, 242}, {165, 114, 227}, { 96, 120, 206},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {137, 178, 250}, {146, 153, 245}, { 74, 108, 205},
+ { 41, 81, 149}, { 24, 55, 104}, { 13, 36, 68},
+ },
+ { // band 2
+ {147, 185, 252}, {127, 161, 246}, { 30, 104, 208},
+ { 11, 74, 154}, { 6, 54, 100}, { 2, 29, 63},
+ },
+ { // band 3
+ {163, 191, 254}, {101, 161, 249}, { 22, 114, 215},
+ { 6, 89, 173}, { 1, 65, 120}, { 1, 1, 170},
+ },
+ { // band 4
+ {197, 160, 254}, {142, 141, 251}, { 39, 102, 218},
+ { 10, 76, 158}, { 1, 56, 122}, {128, 128, 128},
+ },
+ { // band 5
+ {224, 76, 254}, {215, 84, 253}, {107, 85, 232},
+ { 43, 71, 177}, { 1, 1, 254}, {128, 128, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 1
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 68, 37, 120}, { 21, 34, 82}, { 5, 26, 49},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 41, 89, 138}, { 56, 83, 132}, { 31, 73, 115},
+ { 16, 62, 92}, { 5, 45, 62}, { 1, 24, 32},
+ },
+ { // band 2
+ { 48, 139, 165}, { 30, 114, 160}, { 13, 92, 132},
+ { 6, 72, 103}, { 3, 49, 72}, { 1, 26, 41},
+ },
+ { // band 3
+ { 44, 162, 191}, { 20, 127, 175}, { 5, 90, 137},
+ { 1, 62, 100}, { 1, 38, 63}, { 1, 20, 32},
+ },
+ { // band 4
+ { 51, 184, 213}, { 16, 137, 193}, { 2, 89, 143},
+ { 1, 60, 102}, { 1, 39, 66}, { 1, 23, 37},
+ },
+ { // band 5
+ { 76, 200, 235}, { 27, 150, 216}, { 3, 99, 164},
+ { 1, 70, 119}, { 1, 45, 77}, { 1, 22, 38},
+ },
+ },
+ { // Inter
+ { // band 0
+ { 81, 112, 199}, { 49, 101, 164}, { 19, 80, 119},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 12, 181, 217}, { 48, 151, 212}, { 38, 118, 180},
+ { 22, 95, 140}, { 11, 67, 92}, { 13, 46, 44},
+ },
+ { // band 2
+ { 29, 188, 226}, { 19, 147, 210}, { 5, 95, 154},
+ { 4, 68, 106}, { 3, 44, 60}, { 1, 24, 27},
+ },
+ { // band 3
+ { 30, 195, 234}, { 15, 153, 216}, { 3, 95, 156},
+ { 2, 66, 108}, { 2, 44, 62}, { 1, 24, 29},
+ },
+ { // band 4
+ { 36, 203, 243}, { 12, 162, 225}, { 2, 98, 163},
+ { 2, 67, 113}, { 2, 45, 68}, { 1, 24, 34},
+ },
+ { // band 5
+ { 86, 207, 248}, { 35, 165, 236}, { 3, 107, 180},
+ { 1, 73, 128}, { 1, 45, 78}, { 1, 20, 34},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {188, 37, 205}, {118, 51, 172}, { 56, 57, 135},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {116, 135, 225}, {144, 123, 221}, { 72, 103, 197},
+ { 35, 77, 153}, { 15, 47, 82}, { 6, 25, 34},
+ },
+ { // band 2
+ {128, 171, 233}, { 82, 142, 226}, { 31, 106, 191},
+ { 16, 82, 146}, { 9, 59, 98}, { 4, 33, 54},
+ },
+ { // band 3
+ {126, 197, 241}, { 66, 155, 230}, { 18, 108, 190},
+ { 7, 82, 148}, { 3, 58, 98}, { 1, 25, 50},
+ },
+ { // band 4
+ {117, 207, 244}, { 44, 163, 233}, { 9, 112, 191},
+ { 5, 84, 148}, { 3, 61, 87}, { 1, 28, 38},
+ },
+ { // band 5
+ {112, 214, 249}, { 39, 174, 240}, { 6, 125, 205},
+ { 4, 96, 163}, { 5, 66, 100}, { 1, 128, 254},
+ },
+ },
+ { // Inter
+ { // band 0
+ {227, 70, 234}, {145, 91, 213}, { 61, 100, 173},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {108, 198, 243}, {171, 172, 240}, {118, 130, 210},
+ {104, 107, 165}, { 64, 85, 114}, { 55, 64, 60},
+ },
+ { // band 2
+ {110, 208, 247}, { 64, 175, 237}, { 24, 112, 187},
+ { 24, 81, 133}, { 24, 63, 83}, { 21, 47, 53},
+ },
+ { // band 3
+ { 91, 218, 249}, { 46, 188, 238}, { 8, 113, 184},
+ { 5, 83, 137}, { 6, 62, 95}, { 17, 44, 94},
+ },
+ { // band 4
+ { 84, 216, 248}, { 30, 187, 237}, { 2, 117, 188},
+ { 1, 88, 141}, { 3, 63, 98}, { 1, 1, 1},
+ },
+ { // band 5
+ {116, 218, 252}, { 47, 186, 242}, { 2, 132, 204},
+ { 1, 106, 175}, { 1, 88, 104}, { 1, 254, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 2
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 35, 41, 129}, { 12, 30, 70}, { 2, 19, 32},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 30, 77, 116}, { 39, 70, 110}, { 20, 58, 96},
+ { 8, 47, 77}, { 2, 33, 52}, { 1, 17, 26},
+ },
+ { // band 2
+ { 31, 123, 146}, { 18, 103, 140}, { 7, 81, 119},
+ { 2, 62, 95}, { 1, 44, 70}, { 1, 26, 42},
+ },
+ { // band 3
+ { 21, 149, 170}, { 9, 114, 158}, { 2, 80, 126},
+ { 1, 57, 94}, { 1, 36, 61}, { 1, 18, 31},
+ },
+ { // band 4
+ { 20, 178, 199}, { 6, 134, 183}, { 1, 87, 139},
+ { 1, 60, 100}, { 1, 37, 64}, { 1, 18, 31},
+ },
+ { // band 5
+ { 36, 218, 233}, { 6, 160, 207}, { 1, 92, 147},
+ { 1, 59, 101}, { 1, 35, 62}, { 1, 18, 31},
+ },
+ },
+ { // Inter
+ { // band 0
+ { 17, 62, 211}, { 14, 62, 153}, { 5, 50, 84},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 11, 180, 205}, { 87, 160, 205}, { 53, 128, 184},
+ { 27, 106, 156}, { 13, 79, 115}, { 6, 46, 67},
+ },
+ { // band 2
+ { 32, 194, 220}, { 20, 145, 202}, { 4, 96, 152},
+ { 1, 67, 111}, { 1, 42, 70}, { 1, 21, 37},
+ },
+ { // band 3
+ { 30, 204, 228}, { 14, 152, 207}, { 1, 92, 149},
+ { 1, 61, 103}, { 1, 34, 59}, { 1, 16, 28},
+ },
+ { // band 4
+ { 27, 213, 235}, { 7, 159, 210}, { 1, 88, 143},
+ { 1, 55, 94}, { 1, 31, 53}, { 1, 16, 27},
+ },
+ { // band 5
+ { 28, 223, 243}, { 4, 173, 217}, { 1, 91, 146},
+ { 1, 58, 98}, { 1, 35, 60}, { 1, 19, 33},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {172, 37, 202}, { 83, 51, 156}, { 24, 53, 110},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 76, 134, 206}, {110, 124, 200}, { 47, 106, 180},
+ { 15, 82, 145}, { 3, 48, 83}, { 1, 19, 32},
+ },
+ { // band 2
+ { 80, 176, 220}, { 49, 145, 212}, { 17, 112, 180},
+ { 7, 84, 140}, { 1, 53, 89}, { 1, 27, 43},
+ },
+ { // band 3
+ { 74, 201, 232}, { 38, 158, 221}, { 8, 112, 179},
+ { 2, 79, 132}, { 1, 47, 82}, { 1, 26, 42},
+ },
+ { // band 4
+ { 73, 215, 239}, { 28, 169, 227}, { 3, 112, 176},
+ { 1, 74, 126}, { 1, 48, 79}, { 1, 27, 44},
+ },
+ { // band 5
+ { 71, 233, 244}, { 18, 180, 230}, { 1, 114, 180},
+ { 1, 80, 134}, { 1, 51, 85}, { 1, 26, 36},
+ },
+ },
+ { // Inter
+ { // band 0
+ {213, 34, 244}, {126, 57, 212}, { 46, 67, 151},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {120, 202, 245}, {198, 173, 241}, {119, 146, 224},
+ { 76, 126, 195}, { 44, 102, 159}, { 40, 76, 115},
+ },
+ { // band 2
+ {120, 215, 248}, { 69, 171, 237}, { 23, 119, 194},
+ { 10, 86, 147}, { 2, 56, 94}, { 1, 25, 44},
+ },
+ { // band 3
+ {102, 226, 250}, { 53, 183, 239}, { 9, 118, 188},
+ { 2, 78, 131}, { 1, 48, 89}, { 1, 17, 36},
+ },
+ { // band 4
+ { 86, 235, 252}, { 34, 194, 240}, { 2, 109, 173},
+ { 1, 68, 118}, { 1, 44, 79}, { 1, 1, 38},
+ },
+ { // band 5
+ { 59, 236, 243}, { 11, 189, 228}, { 1, 112, 187},
+ { 1, 88, 145}, { 1, 55, 92}, { 1, 1, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 3
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 41, 40, 104}, { 12, 31, 64}, { 2, 16, 28},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 65, 58, 132}, { 50, 61, 130}, { 40, 57, 116},
+ { 22, 46, 87}, { 2, 28, 44}, { 1, 11, 17},
+ },
+ { // band 2
+ { 55, 139, 135}, { 46, 122, 132}, { 21, 89, 110},
+ { 6, 60, 78}, { 1, 38, 54}, { 1, 17, 27},
+ },
+ { // band 3
+ { 29, 167, 161}, { 10, 120, 141}, { 1, 69, 98},
+ { 1, 42, 66}, { 1, 28, 44}, { 1, 15, 24},
+ },
+ { // band 4
+ { 19, 191, 180}, { 4, 125, 154}, { 1, 70, 107},
+ { 1, 48, 77}, { 1, 33, 53}, { 1, 17, 28},
+ },
+ { // band 5
+ { 16, 238, 231}, { 2, 163, 198}, { 1, 85, 134},
+ { 1, 54, 90}, { 1, 34, 57}, { 1, 17, 29},
+ },
+ },
+ { // Inter
+ { // band 0
+ { 70, 15, 216}, { 40, 18, 164}, { 14, 17, 83},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 25, 150, 200}, {185, 154, 211}, {123, 137, 199},
+ { 67, 119, 177}, { 31, 96, 137}, { 18, 63, 86},
+ },
+ { // band 2
+ { 57, 187, 223}, { 35, 148, 207}, { 7, 104, 159},
+ { 2, 72, 113}, { 1, 44, 71}, { 1, 20, 34},
+ },
+ { // band 3
+ { 44, 203, 233}, { 18, 157, 212}, { 1, 98, 150},
+ { 1, 61, 102}, { 1, 38, 62}, { 1, 19, 31},
+ },
+ { // band 4
+ { 41, 215, 238}, { 11, 166, 215}, { 1, 94, 146},
+ { 1, 60, 101}, { 1, 37, 63}, { 1, 17, 28},
+ },
+ { // band 5
+ { 19, 236, 246}, { 3, 188, 223}, { 1, 95, 146},
+ { 1, 58, 95}, { 1, 34, 56}, { 1, 17, 27},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {146, 27, 156}, { 49, 32, 116}, { 10, 39, 77},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 47, 101, 172}, { 93, 100, 178}, { 58, 91, 165},
+ { 26, 75, 134}, { 4, 49, 82}, { 2, 22, 33},
+ },
+ { // band 2
+ { 60, 158, 196}, { 44, 135, 186}, { 25, 106, 157},
+ { 8, 81, 124}, { 2, 56, 86}, { 1, 28, 45},
+ },
+ { // band 3
+ { 44, 169, 212}, { 15, 138, 196}, { 2, 100, 157},
+ { 1, 74, 119}, { 1, 49, 76}, { 1, 20, 34},
+ },
+ { // band 4
+ { 38, 199, 231}, { 11, 158, 214}, { 1, 111, 167},
+ { 1, 76, 122}, { 1, 44, 76}, { 1, 17, 39},
+ },
+ { // band 5
+ { 40, 236, 246}, { 10, 187, 230}, { 1, 115, 175},
+ { 1, 74, 122}, { 1, 42, 71}, { 1, 14, 59},
+ },
+ },
+ { // Inter
+ { // band 0
+ {161, 26, 237}, { 65, 46, 209}, { 21, 46, 161},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 87, 229, 245}, {206, 214, 244}, {148, 186, 236},
+ { 89, 165, 221}, { 41, 132, 186}, { 37, 93, 141},
+ },
+ { // band 2
+ { 93, 231, 246}, { 47, 181, 231}, { 8, 117, 188},
+ { 2, 84, 138}, { 1, 43, 87}, { 1, 27, 41},
+ },
+ { // band 3
+ { 80, 239, 250}, { 28, 190, 236}, { 1, 119, 183},
+ { 1, 84, 135}, { 1, 81, 69}, { 1, 102, 1},
+ },
+ { // band 4
+ { 67, 245, 252}, { 22, 206, 242}, { 1, 130, 195},
+ { 1, 77, 136}, { 1, 35, 88}, {128, 128, 128},
+ },
+ { // band 5
+ { 43, 250, 228}, { 31, 185, 204}, { 6, 101, 183},
+ { 1, 92, 151}, { 1, 84, 137}, {128, 128, 128},
+ },
+ },
+ },
+ },
+ },
+ { // Q_Index 2
+ { // TX_SIZE 0
+ { // Y plane
+ { // Intra
+ { // band 0
+ {181, 22, 175}, { 96, 37, 147}, { 35, 41, 105},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 80, 95, 197}, {111, 92, 193}, { 59, 87, 175},
+ { 29, 79, 150}, { 10, 65, 118}, { 2, 47, 82},
+ },
+ { // band 2
+ { 90, 141, 216}, { 77, 120, 210}, { 23, 95, 184},
+ { 11, 81, 151}, { 6, 75, 130}, { 2, 58, 113},
+ },
+ { // band 3
+ {122, 167, 231}, { 66, 119, 225}, { 26, 87, 189},
+ { 7, 76, 151}, { 2, 63, 125}, { 1, 59, 77},
+ },
+ { // band 4
+ {162, 147, 244}, {110, 97, 236}, { 32, 88, 204},
+ { 11, 89, 174}, { 5, 78, 151}, {128, 128, 128},
+ },
+ { // band 5
+ {205, 59, 251}, {176, 68, 248}, { 90, 71, 223},
+ { 49, 72, 188}, { 17, 74, 203}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {188, 70, 207}, {140, 73, 189}, { 85, 73, 163},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 59, 144, 239}, { 79, 126, 237}, { 31, 102, 202},
+ { 10, 81, 153}, { 3, 56, 102}, { 2, 33, 59},
+ },
+ { // band 2
+ {100, 152, 243}, { 80, 129, 236}, { 14, 94, 194},
+ { 4, 72, 150}, { 1, 50, 103}, { 1, 35, 60},
+ },
+ { // band 3
+ {130, 183, 247}, { 70, 139, 242}, { 19, 100, 203},
+ { 4, 83, 159}, { 1, 59, 119}, { 1, 44, 72},
+ },
+ { // band 4
+ {197, 138, 252}, {135, 107, 247}, { 31, 86, 210},
+ { 7, 74, 160}, { 1, 53, 107}, {128, 128, 128},
+ },
+ { // band 5
+ {229, 54, 254}, {200, 51, 251}, { 83, 61, 226},
+ { 33, 55, 177}, { 12, 74, 145}, {128, 128, 128},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {229, 20, 235}, {183, 37, 221}, {127, 47, 198},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {188, 115, 251}, {208, 110, 250}, {101, 99, 235},
+ { 38, 81, 197}, { 9, 56, 132}, { 9, 52, 63},
+ },
+ { // band 2
+ {189, 150, 252}, {186, 137, 251}, { 54, 107, 236},
+ { 14, 90, 195}, { 1, 89, 104}, {128, 128, 128},
+ },
+ { // band 3
+ {209, 180, 254}, {142, 145, 253}, { 51, 130, 236},
+ { 6, 128, 214}, { 1, 128, 254}, {128, 128, 128},
+ },
+ { // band 4
+ {231, 140, 254}, {194, 128, 254}, { 75, 119, 233},
+ {128, 23, 230}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {244, 59, 254}, {239, 81, 254}, {128, 85, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {246, 55, 247}, {197, 64, 235}, {141, 74, 218},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {178, 163, 254}, {192, 138, 252}, { 85, 103, 231},
+ { 49, 81, 179}, { 32, 54, 133}, { 12, 26, 98},
+ },
+ { // band 2
+ {189, 173, 254}, {179, 150, 253}, { 60, 94, 237},
+ { 34, 81, 198}, { 20, 53, 187}, {128, 128, 128},
+ },
+ { // band 3
+ {202, 191, 254}, {157, 160, 254}, { 57, 117, 240},
+ { 28, 105, 211}, { 1, 128, 1}, {128, 128, 128},
+ },
+ { // band 4
+ {231, 146, 254}, {208, 133, 254}, { 66, 78, 233},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {246, 49, 254}, {246, 63, 254}, { 85, 142, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 1
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 45, 28, 124}, { 23, 35, 107}, { 10, 34, 78},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 53, 99, 177}, { 82, 96, 174}, { 46, 89, 158},
+ { 21, 76, 133}, { 6, 56, 94}, { 1, 33, 54},
+ },
+ { // band 2
+ { 68, 147, 201}, { 42, 124, 195}, { 17, 98, 166},
+ { 7, 75, 131}, { 2, 53, 93}, { 1, 33, 59},
+ },
+ { // band 3
+ { 65, 176, 217}, { 30, 137, 206}, { 6, 97, 167},
+ { 2, 70, 128}, { 1, 47, 88}, { 1, 29, 46},
+ },
+ { // band 4
+ { 69, 195, 232}, { 24, 146, 218}, { 4, 100, 175},
+ { 2, 72, 134}, { 1, 51, 93}, { 1, 29, 52},
+ },
+ { // band 5
+ { 96, 212, 246}, { 39, 158, 234}, { 6, 109, 192},
+ { 2, 77, 144}, { 1, 50, 95}, { 1, 20, 45},
+ },
+ },
+ { // Inter
+ { // band 0
+ { 71, 80, 213}, { 53, 73, 181}, { 25, 66, 141},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 35, 168, 231}, { 91, 150, 229}, { 49, 122, 202},
+ { 22, 97, 162}, { 10, 68, 108}, { 9, 48, 57},
+ },
+ { // band 2
+ { 56, 178, 236}, { 32, 148, 225}, { 9, 99, 176},
+ { 4, 69, 127}, { 2, 44, 78}, { 1, 25, 41},
+ },
+ { // band 3
+ { 57, 191, 242}, { 27, 155, 230}, { 5, 102, 180},
+ { 2, 71, 133}, { 1, 44, 78}, { 1, 27, 41},
+ },
+ { // band 4
+ { 67, 201, 247}, { 24, 162, 237}, { 3, 106, 188},
+ { 3, 74, 137}, { 1, 46, 85}, { 1, 34, 48},
+ },
+ { // band 5
+ {111, 210, 251}, { 47, 166, 244}, { 3, 113, 199},
+ { 2, 77, 146}, { 1, 48, 93}, { 1, 38, 22},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {206, 21, 221}, {150, 36, 195}, { 94, 44, 164},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {147, 128, 239}, {194, 122, 238}, { 95, 104, 220},
+ { 39, 81, 183}, { 13, 53, 111}, { 3, 24, 49},
+ },
+ { // band 2
+ {164, 163, 244}, {106, 142, 239}, { 50, 112, 215},
+ { 26, 90, 177}, { 12, 67, 130}, { 1, 1, 64},
+ },
+ { // band 3
+ {155, 193, 249}, { 88, 158, 244}, { 26, 124, 220},
+ { 10, 98, 173}, { 1, 77, 126}, {128, 128, 128},
+ },
+ { // band 4
+ {141, 205, 252}, { 64, 174, 248}, { 17, 124, 221},
+ { 12, 92, 176}, { 1, 29, 148}, {128, 128, 128},
+ },
+ { // band 5
+ {150, 217, 254}, { 74, 191, 252}, { 30, 144, 215},
+ { 1, 106, 137}, {128, 1, 128}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {241, 37, 242}, {175, 48, 223}, { 99, 53, 189},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {153, 183, 248}, {212, 156, 247}, {134, 124, 221},
+ { 88, 103, 184}, { 59, 86, 132}, { 29, 61, 67},
+ },
+ { // band 2
+ {162, 199, 250}, {106, 167, 247}, { 56, 110, 207},
+ { 32, 85, 165}, { 16, 71, 130}, { 1, 93, 254},
+ },
+ { // band 3
+ {143, 213, 252}, { 86, 187, 250}, { 23, 124, 220},
+ { 7, 95, 176}, { 1, 109, 102}, {128, 128, 128},
+ },
+ { // band 4
+ {130, 219, 254}, { 70, 201, 253}, { 15, 128, 215},
+ { 1, 101, 201}, { 1, 64, 170}, {128, 128, 128},
+ },
+ { // band 5
+ {155, 219, 254}, {105, 207, 254}, { 28, 155, 229},
+ { 1, 153, 191}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 2
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 18, 26, 117}, { 10, 29, 82}, { 3, 25, 52},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 35, 88, 152}, { 62, 85, 150}, { 36, 77, 137},
+ { 16, 66, 116}, { 4, 47, 81}, { 1, 26, 44},
+ },
+ { // band 2
+ { 55, 141, 182}, { 32, 119, 177}, { 12, 93, 154},
+ { 4, 71, 123}, { 1, 51, 89}, { 1, 32, 56},
+ },
+ { // band 3
+ { 46, 171, 202}, { 21, 130, 191}, { 5, 91, 154},
+ { 1, 64, 115}, { 1, 42, 77}, { 1, 25, 41},
+ },
+ { // band 4
+ { 43, 195, 219}, { 12, 142, 203}, { 1, 91, 156},
+ { 1, 63, 115}, { 1, 41, 77}, { 1, 22, 43},
+ },
+ { // band 5
+ { 42, 221, 238}, { 8, 162, 219}, { 1, 98, 167},
+ { 1, 67, 123}, { 1, 43, 83}, { 1, 25, 38},
+ },
+ },
+ { // Inter
+ { // band 0
+ { 16, 51, 216}, { 20, 48, 168}, { 9, 44, 109},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 34, 164, 226}, {124, 148, 226}, { 72, 127, 207},
+ { 36, 107, 175}, { 15, 81, 129}, { 6, 51, 79},
+ },
+ { // band 2
+ { 61, 182, 234}, { 35, 148, 220}, { 9, 101, 178},
+ { 4, 71, 134}, { 1, 46, 90}, { 1, 24, 51},
+ },
+ { // band 3
+ { 54, 198, 239}, { 25, 156, 224}, { 3, 98, 173},
+ { 1, 66, 124}, { 1, 41, 78}, { 1, 15, 37},
+ },
+ { // band 4
+ { 48, 209, 242}, { 12, 162, 226}, { 1, 96, 169},
+ { 1, 63, 119}, { 1, 40, 78}, { 1, 18, 45},
+ },
+ { // band 5
+ { 44, 223, 247}, { 6, 173, 232}, { 1, 105, 178},
+ { 1, 71, 131}, { 1, 44, 84}, { 1, 13, 46},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {188, 26, 214}, {121, 42, 181}, { 66, 49, 149},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {136, 128, 233}, {172, 124, 230}, { 80, 106, 211},
+ { 27, 81, 174}, { 6, 49, 98}, { 8, 28, 49},
+ },
+ { // band 2
+ {145, 166, 239}, { 92, 141, 229}, { 28, 108, 196},
+ { 8, 87, 154}, { 1, 58, 105}, { 1, 27, 59},
+ },
+ { // band 3
+ {131, 193, 242}, { 66, 151, 231}, { 13, 112, 192},
+ { 2, 81, 152}, { 1, 66, 121}, { 1, 23, 64},
+ },
+ { // band 4
+ {112, 211, 246}, { 41, 164, 235}, { 5, 117, 202},
+ { 1, 83, 162}, { 1, 64, 111}, {128, 128, 128},
+ },
+ { // band 5
+ { 96, 230, 250}, { 28, 185, 243}, { 2, 132, 204},
+ { 1, 91, 166}, { 1, 85, 46}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {238, 23, 242}, {157, 29, 215}, { 73, 27, 162},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {165, 173, 250}, {222, 151, 247}, {152, 134, 235},
+ {114, 120, 210}, { 86, 109, 176}, { 53, 88, 145},
+ },
+ { // band 2
+ {164, 194, 249}, {100, 158, 241}, { 35, 111, 212},
+ { 17, 85, 167}, { 1, 52, 112}, { 1, 73, 1},
+ },
+ { // band 3
+ {151, 215, 252}, { 83, 172, 245}, { 16, 122, 208},
+ { 6, 101, 165}, { 1, 74, 113}, { 1, 1, 1},
+ },
+ { // band 4
+ {138, 230, 253}, { 65, 184, 248}, { 8, 128, 212},
+ { 1, 111, 182}, {128, 1, 1}, {128, 128, 128},
+ },
+ { // band 5
+ {123, 240, 253}, { 36, 201, 250}, { 3, 127, 211},
+ { 1, 68, 204}, {128, 1, 1}, {128, 128, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 3
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 51, 21, 156}, { 30, 23, 86}, { 4, 18, 37},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 38, 77, 129}, { 79, 76, 129}, { 40, 66, 117},
+ { 12, 54, 95}, { 1, 36, 60}, { 1, 17, 29},
+ },
+ { // band 2
+ { 44, 133, 149}, { 24, 107, 143}, { 8, 78, 121},
+ { 3, 59, 97}, { 1, 42, 71}, { 1, 22, 37},
+ },
+ { // band 3
+ { 29, 160, 171}, { 9, 114, 158}, { 1, 76, 125},
+ { 1, 54, 93}, { 1, 36, 63}, { 1, 20, 35},
+ },
+ { // band 4
+ { 22, 188, 205}, { 6, 132, 186}, { 1, 87, 144},
+ { 1, 62, 107}, { 1, 41, 72}, { 1, 23, 41},
+ },
+ { // band 5
+ { 25, 233, 236}, { 5, 165, 214}, { 1, 96, 158},
+ { 1, 63, 112}, { 1, 40, 73}, { 1, 23, 40},
+ },
+ },
+ { // Inter
+ { // band 0
+ { 48, 20, 231}, { 37, 21, 179}, { 15, 18, 109},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 41, 154, 216}, {196, 142, 221}, {131, 125, 207},
+ { 84, 111, 181}, { 45, 91, 142}, { 27, 62, 89},
+ },
+ { // band 2
+ { 72, 181, 230}, { 41, 147, 215}, { 10, 102, 173},
+ { 3, 73, 132}, { 1, 47, 89}, { 1, 23, 50},
+ },
+ { // band 3
+ { 60, 201, 236}, { 23, 157, 219}, { 2, 99, 167},
+ { 1, 69, 124}, { 1, 43, 80}, { 1, 22, 39},
+ },
+ { // band 4
+ { 53, 214, 242}, { 15, 165, 224}, { 1, 101, 173},
+ { 1, 70, 131}, { 1, 44, 83}, { 1, 23, 49},
+ },
+ { // band 5
+ { 39, 239, 248}, { 7, 186, 233}, { 1, 108, 174},
+ { 1, 70, 123}, { 1, 43, 77}, { 1, 16, 42},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {161, 26, 204}, { 77, 40, 160}, { 26, 50, 117},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 80, 140, 218}, {136, 133, 215}, { 63, 117, 197},
+ { 20, 93, 170}, { 7, 55, 102}, { 13, 32, 52},
+ },
+ { // band 2
+ { 86, 173, 231}, { 46, 150, 220}, { 18, 118, 190},
+ { 8, 90, 150}, { 2, 60, 95}, { 1, 39, 41},
+ },
+ { // band 3
+ { 80, 183, 242}, { 37, 160, 231}, { 6, 120, 182},
+ { 1, 86, 137}, { 1, 46, 78}, { 1, 15, 24},
+ },
+ { // band 4
+ { 88, 215, 247}, { 42, 179, 235}, { 4, 116, 182},
+ { 2, 80, 133}, { 1, 46, 85}, { 1, 64, 43},
+ },
+ { // band 5
+ {100, 236, 250}, { 31, 186, 234}, { 1, 114, 181},
+ { 1, 85, 135}, { 1, 78, 64}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {213, 13, 245}, {106, 16, 211}, { 32, 11, 156},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {140, 214, 247}, {241, 186, 243}, {177, 172, 235},
+ {128, 156, 219}, {106, 130, 191}, { 99, 105, 152},
+ },
+ { // band 2
+ {125, 218, 248}, { 75, 167, 239}, { 29, 111, 212},
+ { 6, 66, 152}, { 1, 42, 96}, { 1, 85, 128},
+ },
+ { // band 3
+ {120, 232, 252}, { 60, 189, 247}, { 8, 141, 200},
+ { 1, 89, 134}, { 1, 32, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {111, 238, 253}, { 56, 198, 245}, { 1, 123, 208},
+ { 1, 93, 176}, { 1, 1, 73}, {128, 128, 128},
+ },
+ { // band 5
+ { 98, 251, 249}, { 56, 189, 244}, { 17, 113, 220},
+ { 1, 109, 179}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+ },
+ { // Q_Index 3
+ { // TX_SIZE 0
+ { // Y plane
+ { // Intra
+ { // band 0
+ {186, 16, 200}, {122, 31, 187}, { 78, 40, 161},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {146, 119, 245}, {182, 115, 244}, {130, 113, 238},
+ { 88, 110, 225}, { 47, 103, 208}, { 5, 102, 188},
+ },
+ { // band 2
+ {164, 157, 248}, {155, 141, 250}, { 71, 116, 243},
+ { 88, 129, 233}, { 50, 99, 228}, { 26, 148, 191},
+ },
+ { // band 3
+ {200, 158, 253}, {177, 118, 252}, { 99, 113, 245},
+ { 77, 120, 210}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {231, 104, 254}, {209, 82, 254}, {143, 112, 252},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {250, 36, 254}, {243, 55, 254}, {223, 170, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {207, 37, 226}, {164, 46, 218}, {122, 58, 201},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {149, 154, 253}, {170, 137, 253}, { 94, 123, 247},
+ { 42, 113, 222}, { 16, 97, 174}, { 49, 98, 159},
+ },
+ { // band 2
+ {177, 162, 253}, {165, 142, 252}, { 51, 108, 243},
+ { 18, 108, 213}, { 1, 98, 254}, {128, 128, 128},
+ },
+ { // band 3
+ {211, 152, 254}, {184, 116, 254}, { 70, 110, 244},
+ { 8, 108, 237}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {236, 89, 254}, {210, 67, 254}, {112, 111, 248},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {246, 26, 254}, {233, 35, 254}, {128, 1, 254},
+ {254, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {247, 2, 247}, {226, 8, 242}, {191, 14, 235},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {231, 94, 254}, {248, 91, 254}, {186, 89, 252},
+ {128, 92, 244}, { 79, 112, 254}, {128, 128, 128},
+ },
+ { // band 2
+ {228, 145, 253}, {240, 130, 254}, {223, 105, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 3
+ {245, 153, 253}, {240, 120, 254}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {254, 128, 254}, {204, 128, 254}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {253, 7, 249}, {224, 9, 244}, {182, 13, 231},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {234, 109, 254}, {242, 104, 254}, {160, 98, 254},
+ {123, 85, 243}, { 82, 43, 217}, {128, 128, 128},
+ },
+ { // band 2
+ {243, 137, 254}, {240, 118, 254}, {136, 53, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 3
+ {251, 173, 254}, {229, 129, 250}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {254, 119, 254}, {254, 128, 128}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 1
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 49, 26, 159}, { 36, 34, 150}, { 26, 38, 124},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 99, 122, 226}, {143, 119, 225}, { 90, 113, 213},
+ { 46, 102, 193}, { 14, 84, 157}, { 3, 59, 107},
+ },
+ { // band 2
+ {109, 164, 237}, { 74, 142, 233}, { 29, 112, 216},
+ { 14, 92, 184}, { 10, 80, 156}, { 1, 52, 137},
+ },
+ { // band 3
+ {110, 191, 245}, { 59, 156, 240}, { 18, 121, 220},
+ { 8, 97, 184}, { 3, 84, 150}, {128, 128, 128},
+ },
+ { // band 4
+ {115, 203, 250}, { 59, 167, 246}, { 16, 130, 226},
+ { 7, 97, 192}, { 1, 71, 99}, {128, 128, 128},
+ },
+ { // band 5
+ {149, 218, 253}, { 93, 171, 251}, { 28, 125, 233},
+ { 28, 99, 192}, {128, 85, 85}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ { 97, 45, 229}, { 79, 52, 205}, { 46, 58, 171},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 99, 180, 249}, {156, 165, 249}, { 73, 141, 237},
+ { 31, 116, 208}, { 13, 81, 153}, { 5, 42, 86},
+ },
+ { // band 2
+ {113, 188, 251}, { 68, 161, 244}, { 16, 108, 216},
+ { 6, 81, 168}, { 2, 65, 118}, {128, 1, 1},
+ },
+ { // band 3
+ {117, 201, 252}, { 62, 171, 248}, { 12, 119, 221},
+ { 5, 90, 182}, { 4, 66, 116}, {128, 128, 128},
+ },
+ { // band 4
+ {128, 207, 253}, { 70, 176, 251}, { 11, 126, 228},
+ { 6, 89, 189}, { 1, 44, 148}, {128, 128, 128},
+ },
+ { // band 5
+ {162, 218, 254}, {107, 170, 253}, { 22, 131, 238},
+ { 1, 77, 182}, { 1, 254, 128}, {128, 128, 128},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {235, 5, 238}, {194, 14, 223}, {152, 22, 205},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {200, 121, 251}, {241, 115, 252}, {167, 108, 248},
+ { 93, 93, 233}, { 36, 66, 189}, {128, 128, 128},
+ },
+ { // band 2
+ {220, 151, 253}, {176, 135, 252}, { 95, 124, 254},
+ { 64, 105, 217}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 3
+ {225, 189, 254}, {175, 155, 254}, {102, 119, 254},
+ { 1, 1, 1}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {218, 195, 254}, {125, 157, 253}, {128, 128, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {221, 197, 254}, { 85, 210, 254}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {250, 9, 246}, {204, 13, 234}, {144, 18, 211},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {213, 157, 253}, {243, 138, 253}, {170, 117, 250},
+ {109, 91, 233}, { 66, 77, 163}, { 64, 85, 254},
+ },
+ { // band 2
+ {221, 169, 254}, {182, 141, 253}, {112, 120, 239},
+ { 85, 165, 254}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 3
+ {226, 192, 254}, {189, 174, 251}, {153, 128, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {232, 192, 254}, {195, 187, 247}, { 1, 191, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {247, 185, 254}, {254, 93, 254}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 2
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 14, 30, 136}, { 15, 33, 120}, { 10, 33, 90},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 92, 109, 209}, {113, 108, 207}, { 77, 102, 193},
+ { 39, 91, 171}, { 11, 70, 129}, { 2, 44, 77},
+ },
+ { // band 2
+ { 99, 158, 223}, { 66, 135, 217}, { 23, 109, 194},
+ { 9, 85, 160}, { 3, 66, 124}, { 1, 51, 100},
+ },
+ { // band 3
+ { 89, 189, 234}, { 46, 149, 225}, { 10, 110, 194},
+ { 2, 83, 156}, { 1, 57, 113}, { 1, 47, 73},
+ },
+ { // band 4
+ { 78, 206, 242}, { 28, 161, 232}, { 3, 114, 200},
+ { 1, 86, 161}, { 1, 62, 118}, { 1, 1, 1},
+ },
+ { // band 5
+ { 72, 227, 250}, { 20, 182, 242}, { 3, 126, 210},
+ { 2, 91, 166}, { 1, 64, 126}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ { 23, 42, 227}, { 41, 43, 195}, { 25, 45, 146},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {100, 172, 245}, {165, 158, 246}, { 88, 137, 234},
+ { 44, 116, 203}, { 18, 85, 149}, { 7, 56, 92},
+ },
+ { // band 2
+ {117, 188, 247}, { 70, 155, 239}, { 18, 105, 204},
+ { 7, 78, 158}, { 2, 50, 111}, { 1, 38, 77},
+ },
+ { // band 3
+ {104, 207, 250}, { 54, 166, 241}, { 6, 110, 199},
+ { 1, 78, 155}, { 1, 45, 100}, { 1, 1, 1},
+ },
+ { // band 4
+ { 87, 216, 251}, { 30, 177, 243}, { 1, 114, 203},
+ { 1, 85, 157}, { 1, 53, 108}, {128, 128, 128},
+ },
+ { // band 5
+ { 80, 230, 253}, { 23, 193, 248}, { 1, 127, 215},
+ { 1, 94, 170}, { 1, 71, 59}, {128, 128, 128},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {222, 9, 234}, {161, 20, 210}, {113, 30, 185},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {195, 120, 248}, {231, 124, 247}, {148, 116, 238},
+ { 64, 98, 207}, { 20, 70, 147}, { 87, 68, 100},
+ },
+ { // band 2
+ {186, 161, 250}, {124, 148, 245}, { 44, 123, 230},
+ { 23, 107, 205}, { 1, 80, 131}, {128, 128, 128},
+ },
+ { // band 3
+ {172, 196, 252}, {110, 160, 248}, { 37, 134, 235},
+ { 23, 125, 200}, {128, 254, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {173, 209, 253}, {103, 175, 250}, { 1, 120, 240},
+ { 1, 146, 254}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {184, 235, 254}, { 81, 186, 251}, {128, 109, 254},
+ {128, 254, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {248, 8, 243}, {185, 11, 225}, {108, 11, 189},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {208, 158, 254}, {244, 147, 252}, {195, 132, 248},
+ {161, 122, 224}, {129, 114, 188}, { 59, 119, 159},
+ },
+ { // band 2
+ {202, 182, 253}, {143, 161, 251}, { 73, 115, 247},
+ {146, 175, 204}, {128, 1, 254}, {128, 128, 128},
+ },
+ { // band 3
+ {202, 204, 254}, {131, 174, 251}, { 18, 153, 207},
+ {128, 254, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {192, 221, 254}, {114, 190, 254}, {128, 170, 254},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {166, 236, 254}, {119, 200, 254}, {128, 128, 128},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+ { // TX_SIZE 3
+ { // Y plane
+ { // Intra
+ { // band 0
+ { 30, 32, 144}, { 21, 35, 96}, { 4, 27, 55},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 35, 107, 172}, { 61, 104, 170}, { 33, 94, 160},
+ { 13, 80, 139}, { 2, 55, 97}, { 1, 28, 49},
+ },
+ { // band 2
+ { 51, 153, 195}, { 29, 129, 189}, { 9, 99, 163},
+ { 3, 75, 129}, { 1, 49, 88}, { 1, 29, 50},
+ },
+ { // band 3
+ { 53, 164, 210}, { 21, 134, 201}, { 3, 97, 164},
+ { 1, 69, 124}, { 1, 45, 82}, { 1, 31, 58},
+ },
+ { // band 4
+ { 47, 205, 234}, { 18, 158, 220}, { 2, 109, 177},
+ { 1, 78, 137}, { 1, 53, 101}, { 1, 34, 70},
+ },
+ { // band 5
+ { 55, 233, 245}, { 16, 179, 233}, { 1, 116, 191},
+ { 1, 79, 145}, { 1, 53, 101}, { 1, 37, 58},
+ },
+ },
+ { // Inter
+ { // band 0
+ { 36, 33, 227}, { 39, 28, 190}, { 18, 27, 134},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ { 76, 156, 235}, {184, 147, 235}, {114, 130, 220},
+ { 72, 112, 191}, { 42, 87, 144}, { 21, 65, 93},
+ },
+ { // band 2
+ { 96, 179, 240}, { 51, 149, 228}, { 12, 105, 191},
+ { 6, 74, 148}, { 1, 47, 100}, { 1, 29, 53},
+ },
+ { // band 3
+ { 88, 191, 242}, { 35, 154, 231}, { 3, 106, 187},
+ { 1, 74, 140}, { 1, 41, 84}, { 1, 25, 38},
+ },
+ { // band 4
+ { 77, 212, 249}, { 28, 171, 239}, { 2, 117, 199},
+ { 1, 79, 151}, { 1, 45, 99}, { 1, 1, 1},
+ },
+ { // band 5
+ { 77, 236, 252}, { 27, 190, 246}, { 2, 120, 203},
+ { 1, 78, 147}, { 1, 42, 72}, {128, 128, 128},
+ },
+ },
+ },
+ { // UV plane
+ { // Intra
+ { // band 0
+ {185, 11, 227}, {113, 30, 182}, { 57, 44, 144},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {151, 139, 244}, {212, 139, 241}, {124, 126, 231},
+ { 59, 104, 213}, { 26, 73, 158}, { 20, 45, 95},
+ },
+ { // band 2
+ {155, 163, 247}, {108, 152, 239}, { 39, 124, 214},
+ { 7, 109, 162}, { 29, 57, 128}, {128, 128, 128},
+ },
+ { // band 3
+ {158, 176, 250}, { 89, 164, 243}, { 11, 114, 196},
+ { 1, 96, 141}, { 1, 81, 118}, {128, 1, 1},
+ },
+ { // band 4
+ {148, 212, 251}, { 59, 174, 240}, { 2, 130, 203},
+ { 1, 70, 168}, { 1, 51, 106}, {128, 128, 128},
+ },
+ { // band 5
+ {104, 237, 252}, { 39, 190, 246}, { 1, 154, 220},
+ {128, 102, 1}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ { // Inter
+ { // band 0
+ {236, 6, 242}, {111, 6, 206}, { 36, 5, 161},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 1
+ {193, 193, 252}, {248, 182, 251}, {218, 150, 246},
+ {182, 134, 244}, {151, 137, 227}, { 45, 102, 195},
+ },
+ { // band 2
+ {188, 202, 251}, {125, 165, 249}, { 64, 75, 218},
+ { 1, 128, 254}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 3
+ {178, 225, 254}, {107, 188, 231}, { 21, 135, 233},
+ {128, 1, 254}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 4
+ {164, 227, 253}, { 55, 193, 251}, { 1, 111, 225},
+ {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+ },
+ { // band 5
+ {151, 243, 254}, { 50, 203, 254}, {128, 179, 254},
+ {128, 1, 254}, {128, 128, 128}, {128, 128, 128},
+ },
+ },
+ },
+ },
+ },
+};
+#else
static const av1_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = {
- { // Y plane
- { // Intra
- { // Band 0
- { 195, 29, 183 },
- { 84, 49, 136 },
- { 8, 42, 71 } },
- { // Band 1
- { 31, 107, 169 },
- { 35, 99, 159 },
- { 17, 82, 140 },
- { 8, 66, 114 },
- { 2, 44, 76 },
- { 1, 19, 32 } },
- { // Band 2
- { 40, 132, 201 },
- { 29, 114, 187 },
- { 13, 91, 157 },
- { 7, 75, 127 },
- { 3, 58, 95 },
- { 1, 28, 47 } },
- { // Band 3
- { 69, 142, 221 },
- { 42, 122, 201 },
- { 15, 91, 159 },
- { 6, 67, 121 },
- { 1, 42, 77 },
- { 1, 17, 31 } },
- { // Band 4
- { 102, 148, 228 },
- { 67, 117, 204 },
- { 17, 82, 154 },
- { 6, 59, 114 },
- { 2, 39, 75 },
- { 1, 15, 29 } },
- { // Band 5
- { 156, 57, 233 },
- { 119, 57, 212 },
- { 58, 48, 163 },
- { 29, 40, 124 },
- { 12, 30, 81 },
- { 3, 12, 31 } } },
- { // Inter
- { // Band 0
- { 191, 107, 226 },
- { 124, 117, 204 },
- { 25, 99, 155 } },
- { // Band 1
- { 29, 148, 210 },
- { 37, 126, 194 },
- { 8, 93, 157 },
- { 2, 68, 118 },
- { 1, 39, 69 },
- { 1, 17, 33 } },
- { // Band 2
- { 41, 151, 213 },
- { 27, 123, 193 },
- { 3, 82, 144 },
- { 1, 58, 105 },
- { 1, 32, 60 },
- { 1, 13, 26 } },
- { // Band 3
- { 59, 159, 220 },
- { 23, 126, 198 },
- { 4, 88, 151 },
- { 1, 66, 114 },
- { 1, 38, 71 },
- { 1, 18, 34 } },
- { // Band 4
- { 114, 136, 232 },
- { 51, 114, 207 },
- { 11, 83, 155 },
- { 3, 56, 105 },
- { 1, 33, 65 },
- { 1, 17, 34 } },
- { // Band 5
- { 149, 65, 234 },
- { 121, 57, 215 },
- { 61, 49, 166 },
- { 28, 36, 114 },
- { 12, 25, 76 },
- { 3, 16, 42 } } } },
- { // UV plane
- { // Intra
- { // Band 0
- { 214, 49, 220 },
- { 132, 63, 188 },
- { 42, 65, 137 } },
- { // Band 1
- { 85, 137, 221 },
- { 104, 131, 216 },
- { 49, 111, 192 },
- { 21, 87, 155 },
- { 2, 49, 87 },
- { 1, 16, 28 } },
- { // Band 2
- { 89, 163, 230 },
- { 90, 137, 220 },
- { 29, 100, 183 },
- { 10, 70, 135 },
- { 2, 42, 81 },
- { 1, 17, 33 } },
- { // Band 3
- { 108, 167, 237 },
- { 55, 133, 222 },
- { 15, 97, 179 },
- { 4, 72, 135 },
- { 1, 45, 85 },
- { 1, 19, 38 } },
- { // Band 4
- { 124, 146, 240 },
- { 66, 124, 224 },
- { 17, 88, 175 },
- { 4, 58, 122 },
- { 1, 36, 75 },
- { 1, 18, 37 } },
- { // Band 5
- { 141, 79, 241 },
- { 126, 70, 227 },
- { 66, 58, 182 },
- { 30, 44, 136 },
- { 12, 34, 96 },
- { 2, 20, 47 } } },
- { // Inter
- { // Band 0
- { 229, 99, 249 },
- { 143, 111, 235 },
- { 46, 109, 192 } },
- { // Band 1
- { 82, 158, 236 },
- { 94, 146, 224 },
- { 25, 117, 191 },
- { 9, 87, 149 },
- { 3, 56, 99 },
- { 1, 33, 57 } },
- { // Band 2
- { 83, 167, 237 },
- { 68, 145, 222 },
- { 10, 103, 177 },
- { 2, 72, 131 },
- { 1, 41, 79 },
- { 1, 20, 39 } },
- { // Band 3
- { 99, 167, 239 },
- { 47, 141, 224 },
- { 10, 104, 178 },
- { 2, 73, 133 },
- { 1, 44, 85 },
- { 1, 22, 47 } },
- { // Band 4
- { 127, 145, 243 },
- { 71, 129, 228 },
- { 17, 93, 177 },
- { 3, 61, 124 },
- { 1, 41, 84 },
- { 1, 21, 52 } },
- { // Band 5
- { 157, 78, 244 },
- { 140, 72, 231 },
- { 69, 58, 184 },
- { 31, 44, 137 },
- { 14, 38, 105 },
- { 8, 23, 61 } } } }
+ { // Y plane
+ { // Intra
+ { // Band 0
+ { 195, 29, 183 }, { 84, 49, 136 }, { 8, 42, 71 }
+ }, { // Band 1
+ { 31, 107, 169 }, { 35, 99, 159 }, { 17, 82, 140 },
+ { 8, 66, 114 }, { 2, 44, 76 }, { 1, 19, 32 }
+ }, { // Band 2
+ { 40, 132, 201 }, { 29, 114, 187 }, { 13, 91, 157 },
+ { 7, 75, 127 }, { 3, 58, 95 }, { 1, 28, 47 }
+ }, { // Band 3
+ { 69, 142, 221 }, { 42, 122, 201 }, { 15, 91, 159 },
+ { 6, 67, 121 }, { 1, 42, 77 }, { 1, 17, 31 }
+ }, { // Band 4
+ { 102, 148, 228 }, { 67, 117, 204 }, { 17, 82, 154 },
+ { 6, 59, 114 }, { 2, 39, 75 }, { 1, 15, 29 }
+ }, { // Band 5
+ { 156, 57, 233 }, { 119, 57, 212 }, { 58, 48, 163 },
+ { 29, 40, 124 }, { 12, 30, 81 }, { 3, 12, 31 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 191, 107, 226 }, { 124, 117, 204 }, { 25, 99, 155 }
+ }, { // Band 1
+ { 29, 148, 210 }, { 37, 126, 194 }, { 8, 93, 157 },
+ { 2, 68, 118 }, { 1, 39, 69 }, { 1, 17, 33 }
+ }, { // Band 2
+ { 41, 151, 213 }, { 27, 123, 193 }, { 3, 82, 144 },
+ { 1, 58, 105 }, { 1, 32, 60 }, { 1, 13, 26 }
+ }, { // Band 3
+ { 59, 159, 220 }, { 23, 126, 198 }, { 4, 88, 151 },
+ { 1, 66, 114 }, { 1, 38, 71 }, { 1, 18, 34 }
+ }, { // Band 4
+ { 114, 136, 232 }, { 51, 114, 207 }, { 11, 83, 155 },
+ { 3, 56, 105 }, { 1, 33, 65 }, { 1, 17, 34 }
+ }, { // Band 5
+ { 149, 65, 234 }, { 121, 57, 215 }, { 61, 49, 166 },
+ { 28, 36, 114 }, { 12, 25, 76 }, { 3, 16, 42 }
+ }
+ }
+ }, { // UV plane
+ { // Intra
+ { // Band 0
+ { 214, 49, 220 }, { 132, 63, 188 }, { 42, 65, 137 }
+ }, { // Band 1
+ { 85, 137, 221 }, { 104, 131, 216 }, { 49, 111, 192 },
+ { 21, 87, 155 }, { 2, 49, 87 }, { 1, 16, 28 }
+ }, { // Band 2
+ { 89, 163, 230 }, { 90, 137, 220 }, { 29, 100, 183 },
+ { 10, 70, 135 }, { 2, 42, 81 }, { 1, 17, 33 }
+ }, { // Band 3
+ { 108, 167, 237 }, { 55, 133, 222 }, { 15, 97, 179 },
+ { 4, 72, 135 }, { 1, 45, 85 }, { 1, 19, 38 }
+ }, { // Band 4
+ { 124, 146, 240 }, { 66, 124, 224 }, { 17, 88, 175 },
+ { 4, 58, 122 }, { 1, 36, 75 }, { 1, 18, 37 }
+ }, { // Band 5
+ { 141, 79, 241 }, { 126, 70, 227 }, { 66, 58, 182 },
+ { 30, 44, 136 }, { 12, 34, 96 }, { 2, 20, 47 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 229, 99, 249 }, { 143, 111, 235 }, { 46, 109, 192 }
+ }, { // Band 1
+ { 82, 158, 236 }, { 94, 146, 224 }, { 25, 117, 191 },
+ { 9, 87, 149 }, { 3, 56, 99 }, { 1, 33, 57 }
+ }, { // Band 2
+ { 83, 167, 237 }, { 68, 145, 222 }, { 10, 103, 177 },
+ { 2, 72, 131 }, { 1, 41, 79 }, { 1, 20, 39 }
+ }, { // Band 3
+ { 99, 167, 239 }, { 47, 141, 224 }, { 10, 104, 178 },
+ { 2, 73, 133 }, { 1, 44, 85 }, { 1, 22, 47 }
+ }, { // Band 4
+ { 127, 145, 243 }, { 71, 129, 228 }, { 17, 93, 177 },
+ { 3, 61, 124 }, { 1, 41, 84 }, { 1, 21, 52 }
+ }, { // Band 5
+ { 157, 78, 244 }, { 140, 72, 231 }, { 69, 58, 184 },
+ { 31, 44, 137 }, { 14, 38, 105 }, { 8, 23, 61 }
+ }
+ }
+ }
};
static const av1_coeff_probs_model default_coef_probs_8x8[PLANE_TYPES] = {
- { // Y plane
- { // Intra
- { // Band 0
- { 125, 34, 187 },
- { 52, 41, 133 },
- { 6, 31, 56 } },
- { // Band 1
- { 37, 109, 153 },
- { 51, 102, 147 },
- { 23, 87, 128 },
- { 8, 67, 101 },
- { 1, 41, 63 },
- { 1, 19, 29 } },
- { // Band 2
- { 31, 154, 185 },
- { 17, 127, 175 },
- { 6, 96, 145 },
- { 2, 73, 114 },
- { 1, 51, 82 },
- { 1, 28, 45 } },
- { // Band 3
- { 23, 163, 200 },
- { 10, 131, 185 },
- { 2, 93, 148 },
- { 1, 67, 111 },
- { 1, 41, 69 },
- { 1, 14, 24 } },
- { // Band 4
- { 29, 176, 217 },
- { 12, 145, 201 },
- { 3, 101, 156 },
- { 1, 69, 111 },
- { 1, 39, 63 },
- { 1, 14, 23 } },
- { // Band 5
- { 57, 192, 233 },
- { 25, 154, 215 },
- { 6, 109, 167 },
- { 3, 78, 118 },
- { 1, 48, 69 },
- { 1, 21, 29 } } },
- { // Inter
- { // Band 0
- { 202, 105, 245 },
- { 108, 106, 216 },
- { 18, 90, 144 } },
- { // Band 1
- { 33, 172, 219 },
- { 64, 149, 206 },
- { 14, 117, 177 },
- { 5, 90, 141 },
- { 2, 61, 95 },
- { 1, 37, 57 } },
- { // Band 2
- { 33, 179, 220 },
- { 11, 140, 198 },
- { 1, 89, 148 },
- { 1, 60, 104 },
- { 1, 33, 57 },
- { 1, 12, 21 } },
- { // Band 3
- { 30, 181, 221 },
- { 8, 141, 198 },
- { 1, 87, 145 },
- { 1, 58, 100 },
- { 1, 31, 55 },
- { 1, 12, 20 } },
- { // Band 4
- { 32, 186, 224 },
- { 7, 142, 198 },
- { 1, 86, 143 },
- { 1, 58, 100 },
- { 1, 31, 55 },
- { 1, 12, 22 } },
- { // Band 5
- { 57, 192, 227 },
- { 20, 143, 204 },
- { 3, 96, 154 },
- { 1, 68, 112 },
- { 1, 42, 69 },
- { 1, 19, 32 } } } },
- { // UV plane
- { // Intra
- { // Band 0
- { 212, 35, 215 },
- { 113, 47, 169 },
- { 29, 48, 105 } },
- { // Band 1
- { 74, 129, 203 },
- { 106, 120, 203 },
- { 49, 107, 178 },
- { 19, 84, 144 },
- { 4, 50, 84 },
- { 1, 15, 25 } },
- { // Band 2
- { 71, 172, 217 },
- { 44, 141, 209 },
- { 15, 102, 173 },
- { 6, 76, 133 },
- { 2, 51, 89 },
- { 1, 24, 42 } },
- { // Band 3
- { 64, 185, 231 },
- { 31, 148, 216 },
- { 8, 103, 175 },
- { 3, 74, 131 },
- { 1, 46, 81 },
- { 1, 18, 30 } },
- { // Band 4
- { 65, 196, 235 },
- { 25, 157, 221 },
- { 5, 105, 174 },
- { 1, 67, 120 },
- { 1, 38, 69 },
- { 1, 15, 30 } },
- { // Band 5
- { 65, 204, 238 },
- { 30, 156, 224 },
- { 7, 107, 177 },
- { 2, 70, 124 },
- { 1, 42, 73 },
- { 1, 18, 34 } } },
- { // Inter
- { // Band 0
- { 225, 86, 251 },
- { 144, 104, 235 },
- { 42, 99, 181 } },
- { // Band 1
- { 85, 175, 239 },
- { 112, 165, 229 },
- { 29, 136, 200 },
- { 12, 103, 162 },
- { 6, 77, 123 },
- { 2, 53, 84 } },
- { // Band 2
- { 75, 183, 239 },
- { 30, 155, 221 },
- { 3, 106, 171 },
- { 1, 74, 128 },
- { 1, 44, 76 },
- { 1, 17, 28 } },
- { // Band 3
- { 73, 185, 240 },
- { 27, 159, 222 },
- { 2, 107, 172 },
- { 1, 75, 127 },
- { 1, 42, 73 },
- { 1, 17, 29 } },
- { // Band 4
- { 62, 190, 238 },
- { 21, 159, 222 },
- { 2, 107, 172 },
- { 1, 72, 122 },
- { 1, 40, 71 },
- { 1, 18, 32 } },
- { // Band 5
- { 61, 199, 240 },
- { 27, 161, 226 },
- { 4, 113, 180 },
- { 1, 76, 129 },
- { 1, 46, 80 },
- { 1, 23, 41 } } } }
+ { // Y plane
+ { // Intra
+ { // Band 0
+ { 125, 34, 187 }, { 52, 41, 133 }, { 6, 31, 56 }
+ }, { // Band 1
+ { 37, 109, 153 }, { 51, 102, 147 }, { 23, 87, 128 },
+ { 8, 67, 101 }, { 1, 41, 63 }, { 1, 19, 29 }
+ }, { // Band 2
+ { 31, 154, 185 }, { 17, 127, 175 }, { 6, 96, 145 },
+ { 2, 73, 114 }, { 1, 51, 82 }, { 1, 28, 45 }
+ }, { // Band 3
+ { 23, 163, 200 }, { 10, 131, 185 }, { 2, 93, 148 },
+ { 1, 67, 111 }, { 1, 41, 69 }, { 1, 14, 24 }
+ }, { // Band 4
+ { 29, 176, 217 }, { 12, 145, 201 }, { 3, 101, 156 },
+ { 1, 69, 111 }, { 1, 39, 63 }, { 1, 14, 23 }
+ }, { // Band 5
+ { 57, 192, 233 }, { 25, 154, 215 }, { 6, 109, 167 },
+ { 3, 78, 118 }, { 1, 48, 69 }, { 1, 21, 29 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 202, 105, 245 }, { 108, 106, 216 }, { 18, 90, 144 }
+ }, { // Band 1
+ { 33, 172, 219 }, { 64, 149, 206 }, { 14, 117, 177 },
+ { 5, 90, 141 }, { 2, 61, 95 }, { 1, 37, 57 }
+ }, { // Band 2
+ { 33, 179, 220 }, { 11, 140, 198 }, { 1, 89, 148 },
+ { 1, 60, 104 }, { 1, 33, 57 }, { 1, 12, 21 }
+ }, { // Band 3
+ { 30, 181, 221 }, { 8, 141, 198 }, { 1, 87, 145 },
+ { 1, 58, 100 }, { 1, 31, 55 }, { 1, 12, 20 }
+ }, { // Band 4
+ { 32, 186, 224 }, { 7, 142, 198 }, { 1, 86, 143 },
+ { 1, 58, 100 }, { 1, 31, 55 }, { 1, 12, 22 }
+ }, { // Band 5
+ { 57, 192, 227 }, { 20, 143, 204 }, { 3, 96, 154 },
+ { 1, 68, 112 }, { 1, 42, 69 }, { 1, 19, 32 }
+ }
+ }
+ }, { // UV plane
+ { // Intra
+ { // Band 0
+ { 212, 35, 215 }, { 113, 47, 169 }, { 29, 48, 105 }
+ }, { // Band 1
+ { 74, 129, 203 }, { 106, 120, 203 }, { 49, 107, 178 },
+ { 19, 84, 144 }, { 4, 50, 84 }, { 1, 15, 25 }
+ }, { // Band 2
+ { 71, 172, 217 }, { 44, 141, 209 }, { 15, 102, 173 },
+ { 6, 76, 133 }, { 2, 51, 89 }, { 1, 24, 42 }
+ }, { // Band 3
+ { 64, 185, 231 }, { 31, 148, 216 }, { 8, 103, 175 },
+ { 3, 74, 131 }, { 1, 46, 81 }, { 1, 18, 30 }
+ }, { // Band 4
+ { 65, 196, 235 }, { 25, 157, 221 }, { 5, 105, 174 },
+ { 1, 67, 120 }, { 1, 38, 69 }, { 1, 15, 30 }
+ }, { // Band 5
+ { 65, 204, 238 }, { 30, 156, 224 }, { 7, 107, 177 },
+ { 2, 70, 124 }, { 1, 42, 73 }, { 1, 18, 34 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 225, 86, 251 }, { 144, 104, 235 }, { 42, 99, 181 }
+ }, { // Band 1
+ { 85, 175, 239 }, { 112, 165, 229 }, { 29, 136, 200 },
+ { 12, 103, 162 }, { 6, 77, 123 }, { 2, 53, 84 }
+ }, { // Band 2
+ { 75, 183, 239 }, { 30, 155, 221 }, { 3, 106, 171 },
+ { 1, 74, 128 }, { 1, 44, 76 }, { 1, 17, 28 }
+ }, { // Band 3
+ { 73, 185, 240 }, { 27, 159, 222 }, { 2, 107, 172 },
+ { 1, 75, 127 }, { 1, 42, 73 }, { 1, 17, 29 }
+ }, { // Band 4
+ { 62, 190, 238 }, { 21, 159, 222 }, { 2, 107, 172 },
+ { 1, 72, 122 }, { 1, 40, 71 }, { 1, 18, 32 }
+ }, { // Band 5
+ { 61, 199, 240 }, { 27, 161, 226 }, { 4, 113, 180 },
+ { 1, 76, 129 }, { 1, 46, 80 }, { 1, 23, 41 }
+ }
+ }
+ }
};
static const av1_coeff_probs_model default_coef_probs_16x16[PLANE_TYPES] = {
- { // Y plane
- { // Intra
- { // Band 0
- { 7, 27, 153 },
- { 5, 30, 95 },
- { 1, 16, 30 } },
- { // Band 1
- { 50, 75, 127 },
- { 57, 75, 124 },
- { 27, 67, 108 },
- { 10, 54, 86 },
- { 1, 33, 52 },
- { 1, 12, 18 } },
- { // Band 2
- { 43, 125, 151 },
- { 26, 108, 148 },
- { 7, 83, 122 },
- { 2, 59, 89 },
- { 1, 38, 60 },
- { 1, 17, 27 } },
- { // Band 3
- { 23, 144, 163 },
- { 13, 112, 154 },
- { 2, 75, 117 },
- { 1, 50, 81 },
- { 1, 31, 51 },
- { 1, 14, 23 } },
- { // Band 4
- { 18, 162, 185 },
- { 6, 123, 171 },
- { 1, 78, 125 },
- { 1, 51, 86 },
- { 1, 31, 54 },
- { 1, 14, 23 } },
- { // Band 5
- { 15, 199, 227 },
- { 3, 150, 204 },
- { 1, 91, 146 },
- { 1, 55, 95 },
- { 1, 30, 53 },
- { 1, 11, 20 } } },
- { // Inter
- { // Band 0
- { 19, 55, 240 },
- { 19, 59, 196 },
- { 3, 52, 105 } },
- { // Band 1
- { 41, 166, 207 },
- { 104, 153, 199 },
- { 31, 123, 181 },
- { 14, 101, 152 },
- { 5, 72, 106 },
- { 1, 36, 52 } },
- { // Band 2
- { 35, 176, 211 },
- { 12, 131, 190 },
- { 2, 88, 144 },
- { 1, 60, 101 },
- { 1, 36, 60 },
- { 1, 16, 28 } },
- { // Band 3
- { 28, 183, 213 },
- { 8, 134, 191 },
- { 1, 86, 142 },
- { 1, 56, 96 },
- { 1, 30, 53 },
- { 1, 12, 20 } },
- { // Band 4
- { 20, 190, 215 },
- { 4, 135, 192 },
- { 1, 84, 139 },
- { 1, 53, 91 },
- { 1, 28, 49 },
- { 1, 11, 20 } },
- { // Band 5
- { 13, 196, 216 },
- { 2, 137, 192 },
- { 1, 86, 143 },
- { 1, 57, 99 },
- { 1, 32, 56 },
- { 1, 13, 24 } } } },
- { // UV plane
- { // Intra
- { // Band 0
- { 211, 29, 217 },
- { 96, 47, 156 },
- { 22, 43, 87 } },
- { // Band 1
- { 78, 120, 193 },
- { 111, 116, 186 },
- { 46, 102, 164 },
- { 15, 80, 128 },
- { 2, 49, 76 },
- { 1, 18, 28 } },
- { // Band 2
- { 71, 161, 203 },
- { 42, 132, 192 },
- { 10, 98, 150 },
- { 3, 69, 109 },
- { 1, 44, 70 },
- { 1, 18, 29 } },
- { // Band 3
- { 57, 186, 211 },
- { 30, 140, 196 },
- { 4, 93, 146 },
- { 1, 62, 102 },
- { 1, 38, 65 },
- { 1, 16, 27 } },
- { // Band 4
- { 47, 199, 217 },
- { 14, 145, 196 },
- { 1, 88, 142 },
- { 1, 57, 98 },
- { 1, 36, 62 },
- { 1, 15, 26 } },
- { // Band 5
- { 26, 219, 229 },
- { 5, 155, 207 },
- { 1, 94, 151 },
- { 1, 60, 104 },
- { 1, 36, 62 },
- { 1, 16, 28 } } },
- { // Inter
- { // Band 0
- { 233, 29, 248 },
- { 146, 47, 220 },
- { 43, 52, 140 } },
- { // Band 1
- { 100, 163, 232 },
- { 179, 161, 222 },
- { 63, 142, 204 },
- { 37, 113, 174 },
- { 26, 89, 137 },
- { 18, 68, 97 } },
- { // Band 2
- { 85, 181, 230 },
- { 32, 146, 209 },
- { 7, 100, 164 },
- { 3, 71, 121 },
- { 1, 45, 77 },
- { 1, 18, 30 } },
- { // Band 3
- { 65, 187, 230 },
- { 20, 148, 207 },
- { 2, 97, 159 },
- { 1, 68, 116 },
- { 1, 40, 70 },
- { 1, 14, 29 } },
- { // Band 4
- { 40, 194, 227 },
- { 8, 147, 204 },
- { 1, 94, 155 },
- { 1, 65, 112 },
- { 1, 39, 66 },
- { 1, 14, 26 } },
- { // Band 5
- { 16, 208, 228 },
- { 3, 151, 207 },
- { 1, 98, 160 },
- { 1, 67, 117 },
- { 1, 41, 74 },
- { 1, 17, 31 } } } }
+ { // Y plane
+ { // Intra
+ { // Band 0
+ { 7, 27, 153 }, { 5, 30, 95 }, { 1, 16, 30 }
+ }, { // Band 1
+ { 50, 75, 127 }, { 57, 75, 124 }, { 27, 67, 108 },
+ { 10, 54, 86 }, { 1, 33, 52 }, { 1, 12, 18 }
+ }, { // Band 2
+ { 43, 125, 151 }, { 26, 108, 148 }, { 7, 83, 122 },
+ { 2, 59, 89 }, { 1, 38, 60 }, { 1, 17, 27 }
+ }, { // Band 3
+ { 23, 144, 163 }, { 13, 112, 154 }, { 2, 75, 117 },
+ { 1, 50, 81 }, { 1, 31, 51 }, { 1, 14, 23 }
+ }, { // Band 4
+ { 18, 162, 185 }, { 6, 123, 171 }, { 1, 78, 125 },
+ { 1, 51, 86 }, { 1, 31, 54 }, { 1, 14, 23 }
+ }, { // Band 5
+ { 15, 199, 227 }, { 3, 150, 204 }, { 1, 91, 146 },
+ { 1, 55, 95 }, { 1, 30, 53 }, { 1, 11, 20 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 19, 55, 240 }, { 19, 59, 196 }, { 3, 52, 105 }
+ }, { // Band 1
+ { 41, 166, 207 }, { 104, 153, 199 }, { 31, 123, 181 },
+ { 14, 101, 152 }, { 5, 72, 106 }, { 1, 36, 52 }
+ }, { // Band 2
+ { 35, 176, 211 }, { 12, 131, 190 }, { 2, 88, 144 },
+ { 1, 60, 101 }, { 1, 36, 60 }, { 1, 16, 28 }
+ }, { // Band 3
+ { 28, 183, 213 }, { 8, 134, 191 }, { 1, 86, 142 },
+ { 1, 56, 96 }, { 1, 30, 53 }, { 1, 12, 20 }
+ }, { // Band 4
+ { 20, 190, 215 }, { 4, 135, 192 }, { 1, 84, 139 },
+ { 1, 53, 91 }, { 1, 28, 49 }, { 1, 11, 20 }
+ }, { // Band 5
+ { 13, 196, 216 }, { 2, 137, 192 }, { 1, 86, 143 },
+ { 1, 57, 99 }, { 1, 32, 56 }, { 1, 13, 24 }
+ }
+ }
+ }, { // UV plane
+ { // Intra
+ { // Band 0
+ { 211, 29, 217 }, { 96, 47, 156 }, { 22, 43, 87 }
+ }, { // Band 1
+ { 78, 120, 193 }, { 111, 116, 186 }, { 46, 102, 164 },
+ { 15, 80, 128 }, { 2, 49, 76 }, { 1, 18, 28 }
+ }, { // Band 2
+ { 71, 161, 203 }, { 42, 132, 192 }, { 10, 98, 150 },
+ { 3, 69, 109 }, { 1, 44, 70 }, { 1, 18, 29 }
+ }, { // Band 3
+ { 57, 186, 211 }, { 30, 140, 196 }, { 4, 93, 146 },
+ { 1, 62, 102 }, { 1, 38, 65 }, { 1, 16, 27 }
+ }, { // Band 4
+ { 47, 199, 217 }, { 14, 145, 196 }, { 1, 88, 142 },
+ { 1, 57, 98 }, { 1, 36, 62 }, { 1, 15, 26 }
+ }, { // Band 5
+ { 26, 219, 229 }, { 5, 155, 207 }, { 1, 94, 151 },
+ { 1, 60, 104 }, { 1, 36, 62 }, { 1, 16, 28 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 233, 29, 248 }, { 146, 47, 220 }, { 43, 52, 140 }
+ }, { // Band 1
+ { 100, 163, 232 }, { 179, 161, 222 }, { 63, 142, 204 },
+ { 37, 113, 174 }, { 26, 89, 137 }, { 18, 68, 97 }
+ }, { // Band 2
+ { 85, 181, 230 }, { 32, 146, 209 }, { 7, 100, 164 },
+ { 3, 71, 121 }, { 1, 45, 77 }, { 1, 18, 30 }
+ }, { // Band 3
+ { 65, 187, 230 }, { 20, 148, 207 }, { 2, 97, 159 },
+ { 1, 68, 116 }, { 1, 40, 70 }, { 1, 14, 29 }
+ }, { // Band 4
+ { 40, 194, 227 }, { 8, 147, 204 }, { 1, 94, 155 },
+ { 1, 65, 112 }, { 1, 39, 66 }, { 1, 14, 26 }
+ }, { // Band 5
+ { 16, 208, 228 }, { 3, 151, 207 }, { 1, 98, 160 },
+ { 1, 67, 117 }, { 1, 41, 74 }, { 1, 17, 31 }
+ }
+ }
+ }
};
static const av1_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = {
- { // Y plane
- { // Intra
- { // Band 0
- { 17, 38, 140 },
- { 7, 34, 80 },
- { 1, 17, 29 } },
- { // Band 1
- { 37, 75, 128 },
- { 41, 76, 128 },
- { 26, 66, 116 },
- { 12, 52, 94 },
- { 2, 32, 55 },
- { 1, 10, 16 } },
- { // Band 2
- { 50, 127, 154 },
- { 37, 109, 152 },
- { 16, 82, 121 },
- { 5, 59, 85 },
- { 1, 35, 54 },
- { 1, 13, 20 } },
- { // Band 3
- { 40, 142, 167 },
- { 17, 110, 157 },
- { 2, 71, 112 },
- { 1, 44, 72 },
- { 1, 27, 45 },
- { 1, 11, 17 } },
- { // Band 4
- { 30, 175, 188 },
- { 9, 124, 169 },
- { 1, 74, 116 },
- { 1, 48, 78 },
- { 1, 30, 49 },
- { 1, 11, 18 } },
- { // Band 5
- { 10, 222, 223 },
- { 2, 150, 194 },
- { 1, 83, 128 },
- { 1, 48, 79 },
- { 1, 27, 45 },
- { 1, 11, 17 } } },
- { // Inter
- { // Band 0
- { 36, 41, 235 },
- { 29, 36, 193 },
- { 10, 27, 111 } },
- { // Band 1
- { 85, 165, 222 },
- { 177, 162, 215 },
- { 110, 135, 195 },
- { 57, 113, 168 },
- { 23, 83, 120 },
- { 10, 49, 61 } },
- { // Band 2
- { 85, 190, 223 },
- { 36, 139, 200 },
- { 5, 90, 146 },
- { 1, 60, 103 },
- { 1, 38, 65 },
- { 1, 18, 30 } },
- { // Band 3
- { 72, 202, 223 },
- { 23, 141, 199 },
- { 2, 86, 140 },
- { 1, 56, 97 },
- { 1, 36, 61 },
- { 1, 16, 27 } },
- { // Band 4
- { 55, 218, 225 },
- { 13, 145, 200 },
- { 1, 86, 141 },
- { 1, 57, 99 },
- { 1, 35, 61 },
- { 1, 13, 22 } },
- { // Band 5
- { 15, 235, 212 },
- { 1, 132, 184 },
- { 1, 84, 139 },
- { 1, 57, 97 },
- { 1, 34, 56 },
- { 1, 14, 23 } } } },
- { // UV plane
- { // Intra
- { // Band 0
- { 181, 21, 201 },
- { 61, 37, 123 },
- { 10, 38, 71 } },
- { // Band 1
- { 47, 106, 172 },
- { 95, 104, 173 },
- { 42, 93, 159 },
- { 18, 77, 131 },
- { 4, 50, 81 },
- { 1, 17, 23 } },
- { // Band 2
- { 62, 147, 199 },
- { 44, 130, 189 },
- { 28, 102, 154 },
- { 18, 75, 115 },
- { 2, 44, 65 },
- { 1, 12, 19 } },
- { // Band 3
- { 55, 153, 210 },
- { 24, 130, 194 },
- { 3, 93, 146 },
- { 1, 61, 97 },
- { 1, 31, 50 },
- { 1, 10, 16 } },
- { // Band 4
- { 49, 186, 223 },
- { 17, 148, 204 },
- { 1, 96, 142 },
- { 1, 53, 83 },
- { 1, 26, 44 },
- { 1, 11, 17 } },
- { // Band 5
- { 13, 217, 212 },
- { 2, 136, 180 },
- { 1, 78, 124 },
- { 1, 50, 83 },
- { 1, 29, 49 },
- { 1, 14, 23 } } },
- { // Inter
- { // Band 0
- { 197, 13, 247 },
- { 82, 17, 222 },
- { 25, 17, 162 } },
- { // Band 1
- { 126, 186, 247 },
- { 234, 191, 243 },
- { 176, 177, 234 },
- { 104, 158, 220 },
- { 66, 128, 186 },
- { 55, 90, 137 } },
- { // Band 2
- { 111, 197, 242 },
- { 46, 158, 219 },
- { 9, 104, 171 },
- { 2, 65, 125 },
- { 1, 44, 80 },
- { 1, 17, 91 } },
- { // Band 3
- { 104, 208, 245 },
- { 39, 168, 224 },
- { 3, 109, 162 },
- { 1, 79, 124 },
- { 1, 50, 102 },
- { 1, 43, 102 } },
- { // Band 4
- { 84, 220, 246 },
- { 31, 177, 231 },
- { 2, 115, 180 },
- { 1, 79, 134 },
- { 1, 55, 77 },
- { 1, 60, 79 } },
- { // Band 5
- { 43, 243, 240 },
- { 8, 180, 217 },
- { 1, 115, 166 },
- { 1, 84, 121 },
- { 1, 51, 67 },
- { 1, 16, 6 } } } }
+ { // Y plane
+ { // Intra
+ { // Band 0
+ { 17, 38, 140 }, { 7, 34, 80 }, { 1, 17, 29 }
+ }, { // Band 1
+ { 37, 75, 128 }, { 41, 76, 128 }, { 26, 66, 116 },
+ { 12, 52, 94 }, { 2, 32, 55 }, { 1, 10, 16 }
+ }, { // Band 2
+ { 50, 127, 154 }, { 37, 109, 152 }, { 16, 82, 121 },
+ { 5, 59, 85 }, { 1, 35, 54 }, { 1, 13, 20 }
+ }, { // Band 3
+ { 40, 142, 167 }, { 17, 110, 157 }, { 2, 71, 112 },
+ { 1, 44, 72 }, { 1, 27, 45 }, { 1, 11, 17 }
+ }, { // Band 4
+ { 30, 175, 188 }, { 9, 124, 169 }, { 1, 74, 116 },
+ { 1, 48, 78 }, { 1, 30, 49 }, { 1, 11, 18 }
+ }, { // Band 5
+ { 10, 222, 223 }, { 2, 150, 194 }, { 1, 83, 128 },
+ { 1, 48, 79 }, { 1, 27, 45 }, { 1, 11, 17 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 36, 41, 235 }, { 29, 36, 193 }, { 10, 27, 111 }
+ }, { // Band 1
+ { 85, 165, 222 }, { 177, 162, 215 }, { 110, 135, 195 },
+ { 57, 113, 168 }, { 23, 83, 120 }, { 10, 49, 61 }
+ }, { // Band 2
+ { 85, 190, 223 }, { 36, 139, 200 }, { 5, 90, 146 },
+ { 1, 60, 103 }, { 1, 38, 65 }, { 1, 18, 30 }
+ }, { // Band 3
+ { 72, 202, 223 }, { 23, 141, 199 }, { 2, 86, 140 },
+ { 1, 56, 97 }, { 1, 36, 61 }, { 1, 16, 27 }
+ }, { // Band 4
+ { 55, 218, 225 }, { 13, 145, 200 }, { 1, 86, 141 },
+ { 1, 57, 99 }, { 1, 35, 61 }, { 1, 13, 22 }
+ }, { // Band 5
+ { 15, 235, 212 }, { 1, 132, 184 }, { 1, 84, 139 },
+ { 1, 57, 97 }, { 1, 34, 56 }, { 1, 14, 23 }
+ }
+ }
+ }, { // UV plane
+ { // Intra
+ { // Band 0
+ { 181, 21, 201 }, { 61, 37, 123 }, { 10, 38, 71 }
+ }, { // Band 1
+ { 47, 106, 172 }, { 95, 104, 173 }, { 42, 93, 159 },
+ { 18, 77, 131 }, { 4, 50, 81 }, { 1, 17, 23 }
+ }, { // Band 2
+ { 62, 147, 199 }, { 44, 130, 189 }, { 28, 102, 154 },
+ { 18, 75, 115 }, { 2, 44, 65 }, { 1, 12, 19 }
+ }, { // Band 3
+ { 55, 153, 210 }, { 24, 130, 194 }, { 3, 93, 146 },
+ { 1, 61, 97 }, { 1, 31, 50 }, { 1, 10, 16 }
+ }, { // Band 4
+ { 49, 186, 223 }, { 17, 148, 204 }, { 1, 96, 142 },
+ { 1, 53, 83 }, { 1, 26, 44 }, { 1, 11, 17 }
+ }, { // Band 5
+ { 13, 217, 212 }, { 2, 136, 180 }, { 1, 78, 124 },
+ { 1, 50, 83 }, { 1, 29, 49 }, { 1, 14, 23 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 197, 13, 247 }, { 82, 17, 222 }, { 25, 17, 162 }
+ }, { // Band 1
+ { 126, 186, 247 }, { 234, 191, 243 }, { 176, 177, 234 },
+ { 104, 158, 220 }, { 66, 128, 186 }, { 55, 90, 137 }
+ }, { // Band 2
+ { 111, 197, 242 }, { 46, 158, 219 }, { 9, 104, 171 },
+ { 2, 65, 125 }, { 1, 44, 80 }, { 1, 17, 91 }
+ }, { // Band 3
+ { 104, 208, 245 }, { 39, 168, 224 }, { 3, 109, 162 },
+ { 1, 79, 124 }, { 1, 50, 102 }, { 1, 43, 102 }
+ }, { // Band 4
+ { 84, 220, 246 }, { 31, 177, 231 }, { 2, 115, 180 },
+ { 1, 79, 134 }, { 1, 55, 77 }, { 1, 60, 79 }
+ }, { // Band 5
+ { 43, 243, 240 }, { 8, 180, 217 }, { 1, 115, 166 },
+ { 1, 84, 121 }, { 1, 51, 67 }, { 1, 16, 6 }
+ }
+ }
+ }
};
+#endif // CONFIG_ENTROPY
+/* clang-format on */
static void extend_to_full_distribution(aom_prob *probs, aom_prob p) {
- // TODO(aconverse): model[PIVOT_NODE] should never be zero.
- // https://code.google.com/p/webm/issues/detail?id=1089
- memcpy(probs, av1_pareto8_full[p == 0 ? 254 : p - 1],
- MODEL_NODES * sizeof(aom_prob));
+ assert(p != 0);
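+  // Row p - 1 of av1_pareto8_full holds the model extension for pivot
+  // probability p, so p must be non-zero to index the table.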
+ memcpy(probs, av1_pareto8_full[p - 1], MODEL_NODES * sizeof(aom_prob));
}
void av1_model_to_full_probs(const aom_prob *model, aom_prob *full) {
@@ -1333,7 +2807,7 @@
extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
}
-#if CONFIG_EC_MULTISYMBOL
+#if CONFIG_RANS || CONFIG_DAALA_EC
static void build_token_cdfs(const aom_prob *pdf_model,
aom_cdf_prob cdf[ENTROPY_TOKENS]) {
int i, sum = 0;
@@ -1354,25 +2828,27 @@
build_token_cdfs(fc->coef_probs[t][i][j][k][l],
fc->coef_cdfs[t][i][j][k][l]);
}
-#endif // CONFIG_EC_MULTISYMBOL
+#endif // CONFIG_RANS || CONFIG_DAALA_EC
void av1_default_coef_probs(AV1_COMMON *cm) {
+#if CONFIG_ENTROPY
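+  // Pick the default coefficient probability set by quantizer: the 8-bit
+  // base_qindex is rounded into one of QCTX_BINS bins and clamped to the
+  // last bin.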
+ const int index = AOMMIN(
+ ROUND_POWER_OF_TWO(cm->base_qindex, 8 - QCTX_BIN_BITS), QCTX_BINS - 1);
+ av1_copy(cm->fc->coef_probs, default_qctx_coef_probs[index]);
+#else
+#if CONFIG_CB4X4
+ av1_copy(cm->fc->coef_probs[TX_2X2], default_coef_probs_4x4);
+#endif
av1_copy(cm->fc->coef_probs[TX_4X4], default_coef_probs_4x4);
av1_copy(cm->fc->coef_probs[TX_8X8], default_coef_probs_8x8);
av1_copy(cm->fc->coef_probs[TX_16X16], default_coef_probs_16x16);
av1_copy(cm->fc->coef_probs[TX_32X32], default_coef_probs_32x32);
-#if CONFIG_EC_MULTISYMBOL
+#endif // CONFIG_ENTROPY
+#if CONFIG_RANS || CONFIG_DAALA_EC
av1_coef_pareto_cdfs(cm->fc);
-#endif // CONFIG_EC_MULTISYMBOL
+#endif // CONFIG_RANS || CONFIG_DAALA_EC
}
-#define COEF_COUNT_SAT 24
-#define COEF_MAX_UPDATE_FACTOR 112
-#define COEF_COUNT_SAT_KEY 24
-#define COEF_MAX_UPDATE_FACTOR_KEY 112
-#define COEF_COUNT_SAT_AFTER_KEY 24
-#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
-
#if CONFIG_ADAPT_SCAN
#define ADAPT_SCAN_UPDATE_RATE_16 (1 << 13)
#endif
@@ -1382,10 +2858,19 @@
unsigned int update_factor) {
const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
av1_coeff_probs_model *const probs = cm->fc->coef_probs[tx_size];
+#if CONFIG_ENTROPY
+ const av1_coeff_probs_model *const pre_probs =
+ cm->partial_prob_update
+ ? (const av1_coeff_probs_model *)cm->starting_coef_probs[tx_size]
+ : pre_fc->coef_probs[tx_size];
+#else
const av1_coeff_probs_model *const pre_probs = pre_fc->coef_probs[tx_size];
- av1_coeff_count_model *counts = cm->counts.coef[tx_size];
- unsigned int(*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
- cm->counts.eob_branch[tx_size];
+#endif // CONFIG_ENTROPY
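+  // The casts below only add const qualification so the frame counts are
+  // read-only during adaptation.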
+ const av1_coeff_count_model *const counts =
+ (const av1_coeff_count_model *)cm->counts.coef[tx_size];
+ const unsigned int(*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
+ (const unsigned int(*)[REF_TYPES][COEF_BANDS]
+ [COEFF_CONTEXTS])cm->counts.eob_branch[tx_size];
int i, j, k, l, m;
for (i = 0; i < PLANE_TYPES; ++i)
@@ -1401,38 +2886,59 @@
};
for (m = 0; m < UNCONSTRAINED_NODES; ++m)
probs[i][j][k][l][m] =
- merge_probs(pre_probs[i][j][k][l][m], branch_ct[m], count_sat,
- update_factor);
+ av1_merge_probs(pre_probs[i][j][k][l][m], branch_ct[m],
+ count_sat, update_factor);
}
}
void av1_adapt_coef_probs(AV1_COMMON *cm) {
-#if CONFIG_ADAPT_SCAN
- TX_TYPE tx_type;
TX_SIZE tx_size;
-#endif
- TX_SIZE t;
unsigned int count_sat, update_factor;
- if (frame_is_intra_only(cm)) {
- update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
- count_sat = COEF_COUNT_SAT_KEY;
- } else if (cm->last_frame_type == KEY_FRAME) {
+#if CONFIG_ADAPT_SCAN
+ TX_TYPE tx_type;
+#endif
+
+#if CONFIG_ENTROPY
+ if (!frame_is_intra_only(cm) && cm->last_frame_type == KEY_FRAME) {
+ update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY_BITS; /* adapt quickly */
+ count_sat = COEF_COUNT_SAT_AFTER_KEY_BITS;
+ } else {
+ update_factor = COEF_MAX_UPDATE_FACTOR_BITS;
+ count_sat = COEF_COUNT_SAT_BITS;
+ }
+ if (cm->partial_prob_update == 1) {
+ update_factor = COEF_MAX_UPDATE_FACTOR_BITS;
+ }
+#else
+ if (!frame_is_intra_only(cm) && cm->last_frame_type == KEY_FRAME) {
update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY; /* adapt quickly */
count_sat = COEF_COUNT_SAT_AFTER_KEY;
} else {
update_factor = COEF_MAX_UPDATE_FACTOR;
count_sat = COEF_COUNT_SAT;
}
+#endif // CONFIG_ENTROPY
+ for (tx_size = 0; tx_size < TX_SIZES; tx_size++)
+ adapt_coef_probs(cm, tx_size, count_sat, update_factor);
#if CONFIG_ADAPT_SCAN
- for (tx_size = 0; tx_size < TX_SIZES; ++tx_size)
- for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+ for (tx_size = TX_4X4; tx_size < TX_SIZES; ++tx_size)
+ for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
av1_update_scan_prob(cm, tx_size, tx_type, ADAPT_SCAN_UPDATE_RATE_16);
av1_update_scan_order_facade(cm, tx_size, tx_type);
}
#endif
-
- for (t = 0; t <= TX_32X32; t++)
- adapt_coef_probs(cm, t, count_sat, update_factor);
}
+
+#if CONFIG_ENTROPY
+void av1_partial_adapt_probs(AV1_COMMON *cm, int mi_row, int mi_col) {
+ (void)mi_row;
+ (void)mi_col;
+
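+  // In-frame (partial) adaptation only applies with backward-update frame
+  // contexts; setting partial_prob_update makes adapt_coef_probs start from
+  // starting_coef_probs instead of the previous frame context.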
+ if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ cm->partial_prob_update = 1;
+ av1_adapt_coef_probs(cm);
+ }
+}
+#endif // CONFIG_ENTROPY
diff --git a/av1/common/entropy.h b/av1/common/entropy.h
index 92719db..55ed8d4 100644
--- a/av1/common/entropy.h
+++ b/av1/common/entropy.h
@@ -26,6 +26,12 @@
#define DIFF_UPDATE_PROB 252
#define GROUP_DIFF_UPDATE_PROB 252
+#if CONFIG_ENTROPY
+#define COEF_PROBS_BUFS 16
+#define QCTX_BIN_BITS 2
+#define QCTX_BINS (1 << QCTX_BIN_BITS)
+#endif // CONFIG_ENTROPY
+
// Coefficient token alphabet
#define ZERO_TOKEN 0 // 0 Extra Bits 0+0
#define ONE_TOKEN 1 // 1 Extra Bits 0+1
@@ -122,7 +128,9 @@
distinct bands). */
#define COEFF_CONTEXTS 6
-#define BAND_COEFF_CONTEXTS(band) ((band) == 0 ? 3 : COEFF_CONTEXTS)
+#define COEFF_CONTEXTS0 3 // for band 0
+#define BAND_COEFF_CONTEXTS(band) \
+ ((band) == 0 ? COEFF_CONTEXTS0 : COEFF_CONTEXTS)
// #define ENTROPY_STATS
@@ -137,6 +145,9 @@
struct AV1Common;
void av1_default_coef_probs(struct AV1Common *cm);
void av1_adapt_coef_probs(struct AV1Common *cm);
+#if CONFIG_ENTROPY
+void av1_partial_adapt_probs(struct AV1Common *cm, int mi_row, int mi_col);
+#endif // CONFIG_ENTROPY
// This is the index in the scan order beyond which all coefficients for
// 8x8 transform and above are in the top band.
@@ -144,11 +155,23 @@
#define MAXBAND_INDEX 21
DECLARE_ALIGNED(16, extern const uint8_t, av1_coefband_trans_8x8plus[1024]);
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, extern const uint8_t, av1_coefband_trans_4x8_8x4[32]);
+#endif // CONFIG_EXT_TX
DECLARE_ALIGNED(16, extern const uint8_t, av1_coefband_trans_4x4[16]);
+DECLARE_ALIGNED(16, extern const uint16_t, band_count_table[TX_SIZES_ALL][8]);
+DECLARE_ALIGNED(16, extern const uint16_t,
+ band_cum_count_table[TX_SIZES_ALL][8]);
+
static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
- return tx_size == TX_4X4 ? av1_coefband_trans_4x4
- : av1_coefband_trans_8x8plus;
+ switch (tx_size) {
+ case TX_4X4: return av1_coefband_trans_4x4;
+#if CONFIG_EXT_TX
+ case TX_4X8: return av1_coefband_trans_4x8_8x4;
+#endif // CONFIG_EXT_TX
+ default: return av1_coefband_trans_8x8plus;
+ }
}
// 128 lists of probabilities are stored for the following ONE node probs:
@@ -194,15 +217,35 @@
const ENTROPY_CONTEXT *l) {
ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
-#if CONFIG_CB4X4
- if (tx_size == 0) assert(0);
-#endif
-
switch (tx_size) {
case TX_4X4:
above_ec = a[0] != 0;
left_ec = l[0] != 0;
break;
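+    // Rectangular transforms read above/left contexts of different widths:
+    // e.g. TX_4X8 covers one 4-sample unit above but two units to the left.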
+ case TX_4X8:
+ above_ec = a[0] != 0;
+ left_ec = !!*(const uint16_t *)l;
+ break;
+ case TX_8X4:
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = l[0] != 0;
+ break;
+ case TX_8X16:
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = !!*(const uint32_t *)l;
+ break;
+ case TX_16X8:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint16_t *)l;
+ break;
+ case TX_16X32:
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint64_t *)l;
+ break;
+ case TX_32X16:
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!*(const uint32_t *)l;
+ break;
case TX_8X8:
above_ec = !!*(const uint16_t *)a;
left_ec = !!*(const uint16_t *)l;
@@ -217,10 +260,60 @@
break;
default: assert(0 && "Invalid transform size."); break;
}
-
return combine_entropy_contexts(above_ec, left_ec);
}
+#if CONFIG_RANS || CONFIG_DAALA_EC
+struct frame_contexts;
+void av1_coef_pareto_cdfs(struct frame_contexts *fc);
+#endif // CONFIG_RANS || CONFIG_DAALA_EC
+
+#if CONFIG_ENTROPY
+#define COEF_COUNT_SAT_BITS 5
+#define COEF_MAX_UPDATE_FACTOR_BITS 7
+#define COEF_COUNT_SAT_AFTER_KEY_BITS 5
+#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY_BITS 7
+#define MODE_MV_COUNT_SAT_BITS 5
+#define MODE_MV_MAX_UPDATE_FACTOR_BITS 7
+
+#else
+
+#define COEF_COUNT_SAT 24
+#define COEF_MAX_UPDATE_FACTOR 112
+#define COEF_COUNT_SAT_AFTER_KEY 24
+#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
+
+#endif // CONFIG_ENTROPY
+
+#if CONFIG_ADAPT_SCAN
+#define ADAPT_SCAN_UPDATE_RATE_16 (1 << 13)
+#endif
+
+static INLINE aom_prob av1_merge_probs(aom_prob pre_prob,
+ const unsigned int ct[2],
+ unsigned int count_sat,
+ unsigned int max_update_factor) {
+#if CONFIG_ENTROPY
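+  // In this path count_sat and max_update_factor are bit counts: the
+  // observed count saturates at 1 << count_sat and the blend factor is
+  // count << (max_update_factor - count_sat), at most 1 << max_update_factor.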
+ const aom_prob prob = get_binary_prob(ct[0], ct[1]);
+ const unsigned int count =
+ AOMMIN(ct[0] + ct[1], (unsigned int)(1 << count_sat));
+ const unsigned int factor = count << (max_update_factor - count_sat);
+ return weighted_prob(pre_prob, prob, factor);
+#else
+ return merge_probs(pre_prob, ct, count_sat, max_update_factor);
+#endif // CONFIG_ENTROPY
+}
+
+static INLINE aom_prob av1_mode_mv_merge_probs(aom_prob pre_prob,
+ const unsigned int ct[2]) {
+#if CONFIG_ENTROPY
+ return av1_merge_probs(pre_prob, ct, MODE_MV_COUNT_SAT_BITS,
+ MODE_MV_MAX_UPDATE_FACTOR_BITS);
+#else
+ return mode_mv_merge_probs(pre_prob, ct);
+#endif // CONFIG_ENTROPY
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/common/entropymode.c b/av1/common/entropymode.c
index 243532b..910ba0f 100644
--- a/av1/common/entropymode.c
+++ b/av1/common/entropymode.c
@@ -11,6 +11,7 @@
#include "aom_mem/aom_mem.h"
+#include "av1/common/reconinter.h"
#include "av1/common/scan.h"
#include "av1/common/onyxc_int.h"
#include "av1/common/seg_common.h"
@@ -171,6 +172,38 @@
{ 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm
};
+#if CONFIG_EXT_PARTITION_TYPES
+static const aom_prob
+ default_partition_probs[PARTITION_CONTEXTS][EXT_PARTITION_TYPES - 1] = {
+ // 8x8 -> 4x4
+ { 199, 122, 141, 128, 128, 128, 128 }, // a/l both not split
+ { 147, 63, 159, 128, 128, 128, 128 }, // a split, l not split
+ { 148, 133, 118, 128, 128, 128, 128 }, // l split, a not split
+ { 121, 104, 114, 128, 128, 128, 128 }, // a/l both split
+ // 16x16 -> 8x8
+ { 174, 73, 87, 128, 128, 128, 128 }, // a/l both not split
+ { 92, 41, 83, 128, 128, 128, 128 }, // a split, l not split
+ { 82, 99, 50, 128, 128, 128, 128 }, // l split, a not split
+ { 53, 39, 39, 128, 128, 128, 128 }, // a/l both split
+ // 32x32 -> 16x16
+ { 177, 58, 59, 128, 128, 128, 128 }, // a/l both not split
+ { 68, 26, 63, 128, 128, 128, 128 }, // a split, l not split
+ { 52, 79, 25, 128, 128, 128, 128 }, // l split, a not split
+ { 17, 14, 12, 128, 128, 128, 128 }, // a/l both split
+ // 64x64 -> 32x32
+ { 222, 34, 30, 128, 128, 128, 128 }, // a/l both not split
+ { 72, 16, 44, 128, 128, 128, 128 }, // a split, l not split
+ { 58, 32, 12, 128, 128, 128, 128 }, // l split, a not split
+ { 10, 7, 6, 128, 128, 128, 128 }, // a/l both split
+#if CONFIG_EXT_PARTITION
+ // 128x128 -> 64x64
+ { 222, 34, 30, 128, 128, 128, 128 }, // a/l both not split
+ { 72, 16, 44, 128, 128, 128, 128 }, // a split, l not split
+ { 58, 32, 12, 128, 128, 128, 128 }, // l split, a not split
+ { 10, 7, 6, 128, 128, 128, 128 }, // a/l both split
+#endif // CONFIG_EXT_PARTITION
+ };
+#else
static const aom_prob
default_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] = {
// 8x8 -> 4x4
@@ -193,7 +226,15 @@
{ 72, 16, 44 }, // a split, l not split
{ 58, 32, 12 }, // l split, a not split
{ 10, 7, 6 }, // a/l both split
+#if CONFIG_EXT_PARTITION
+ // 128x128 -> 64x64
+ { 222, 34, 30 }, // a/l both not split
+ { 72, 16, 44 }, // a split, l not split
+ { 58, 32, 12 }, // l split, a not split
+ { 10, 7, 6 }, // a/l both split
+#endif // CONFIG_EXT_PARTITION
};
+#endif // CONFIG_EXT_PARTITION_TYPES
#if CONFIG_REF_MV
static const aom_prob default_newmv_prob[NEWMV_MODE_CONTEXTS] = {
@@ -210,10 +251,24 @@
static const aom_prob default_drl_prob[DRL_MODE_CONTEXTS] = { 128, 160, 180,
128, 160 };
-#endif
+
+#if CONFIG_EXT_INTER
+static const aom_prob default_new2mv_prob = 180;
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_REF_MV
static const aom_prob
default_inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1] = {
+#if CONFIG_EXT_INTER
+ // TODO(zoeliu): To adjust the initial default probs
+ { 2, 173, 34, 173 }, // 0 = both zero mv
+ { 7, 145, 85, 145 }, // 1 = one zero mv + one a predicted mv
+ { 7, 166, 63, 166 }, // 2 = two predicted mvs
+ { 7, 94, 66, 128 }, // 3 = one predicted/zero and one new mv
+ { 8, 64, 46, 128 }, // 4 = two new mvs
+ { 17, 81, 31, 128 }, // 5 = one intra neighbour + x
+ { 25, 29, 30, 96 }, // 6 = two intra neighbours
+#else
{ 2, 173, 34 }, // 0 = both zero mv
{ 7, 145, 85 }, // 1 = one zero mv + one a predicted mv
{ 7, 166, 63 }, // 2 = two predicted mvs
@@ -221,17 +276,106 @@
{ 8, 64, 46 }, // 4 = two new mvs
{ 17, 81, 31 }, // 5 = one intra neighbour + x
{ 25, 29, 30 }, // 6 = two intra neighbours
+#endif // CONFIG_EXT_INTER
};
-int av1_intra_mode_ind[INTRA_MODES];
-int av1_intra_mode_inv[INTRA_MODES];
-int av1_inter_mode_ind[INTER_MODES];
-int av1_inter_mode_inv[INTER_MODES];
+#if CONFIG_EXT_INTER
+static const aom_prob default_inter_compound_mode_probs
+ [INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES - 1] = {
+ { 2, 173, 68, 192, 64, 192, 128, 180, 180 }, // 0 = both zero mv
+ { 7, 145, 160, 192, 64, 192, 128, 180, 180 }, // 1 = 1 zero + 1 predicted
+ { 7, 166, 126, 192, 64, 192, 128, 180, 180 }, // 2 = two predicted mvs
+ { 7, 94, 132, 192, 64, 192, 128, 180, 180 }, // 3 = 1 pred/zero, 1 new
+ { 8, 64, 64, 192, 64, 192, 128, 180, 180 }, // 4 = two new mvs
+ { 17, 81, 52, 192, 64, 192, 128, 180, 180 }, // 5 = one intra neighbour
+ { 25, 29, 50, 192, 64, 192, 128, 180, 180 }, // 6 = two intra neighbours
+ };
+
+static const aom_prob default_interintra_prob[BLOCK_SIZE_GROUPS] = {
+ 208, 208, 208, 208,
+};
+
+static const aom_prob
+ default_interintra_mode_prob[BLOCK_SIZE_GROUPS][INTERINTRA_MODES - 1] = {
+ { 65, 32, 18, 144, 162, 194, 41, 51, 98 }, // block_size < 8x8
+ { 132, 68, 18, 165, 217, 196, 45, 40, 78 }, // block_size < 16x16
+ { 173, 80, 19, 176, 240, 193, 64, 35, 46 }, // block_size < 32x32
+ { 221, 135, 38, 194, 248, 121, 96, 85, 29 } // block_size >= 32x32
+ };
+
+static const aom_prob default_wedge_interintra_prob[BLOCK_SIZES] = {
+ 208, 208, 208, 208, 208, 208, 216, 216, 216, 224, 224, 224, 240,
+#if CONFIG_EXT_PARTITION
+ 208, 208, 208
+#endif // CONFIG_EXT_PARTITION
+};
+
+static const aom_prob default_wedge_interinter_prob[BLOCK_SIZES] = {
+ 208, 208, 208, 208, 208, 208, 216, 216, 216, 224, 224, 224, 240,
+#if CONFIG_EXT_PARTITION
+ 255, 255, 255
+#endif // CONFIG_EXT_PARTITION
+};
+#endif // CONFIG_EXT_INTER
+
+// Change this section appropriately once warped motion is supported
+#if CONFIG_MOTION_VAR && !CONFIG_WARPED_MOTION
+const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)] = {
+ -SIMPLE_TRANSLATION, -OBMC_CAUSAL
+};
+static const aom_prob default_motion_mode_prob[BLOCK_SIZES]
+ [MOTION_MODES - 1] = {
+ { 255 }, { 255 }, { 255 },
+ { 151 }, { 153 }, { 144 },
+ { 178 }, { 165 }, { 160 },
+ { 207 }, { 195 }, { 168 },
+ { 244 },
+#if CONFIG_EXT_PARTITION
+ { 252 }, { 252 }, { 252 },
+#endif // CONFIG_EXT_PARTITION
+ };
+
+#elif !CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+
+const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)] = {
+ -SIMPLE_TRANSLATION, -WARPED_CAUSAL
+};
+static const aom_prob default_motion_mode_prob[BLOCK_SIZES]
+ [MOTION_MODES - 1] = {
+ { 255 }, { 255 }, { 255 },
+ { 151 }, { 153 }, { 144 },
+ { 178 }, { 165 }, { 160 },
+ { 207 }, { 195 }, { 168 },
+ { 244 },
+#if CONFIG_EXT_PARTITION
+ { 252 }, { 252 }, { 252 },
+#endif // CONFIG_EXT_PARTITION
+ };
+
+#elif CONFIG_MOTION_VAR && CONFIG_WARPED_MOTION
+
+const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)] = {
+ -SIMPLE_TRANSLATION, 2, -OBMC_CAUSAL, -WARPED_CAUSAL,
+};
+static const aom_prob default_motion_mode_prob[BLOCK_SIZES][MOTION_MODES - 1] =
+ {
+ { 255, 200 }, { 255, 200 }, { 255, 200 }, { 151, 200 }, { 153, 200 },
+ { 144, 200 }, { 178, 200 }, { 165, 200 }, { 160, 200 }, { 207, 200 },
+ { 195, 200 }, { 168, 200 }, { 244, 200 },
+#if CONFIG_EXT_PARTITION
+ { 252, 200 }, { 252, 200 }, { 252, 200 },
+#endif // CONFIG_EXT_PARTITION
+ };
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
#if CONFIG_DELTA_Q
static const aom_prob default_delta_q_probs[DELTA_Q_CONTEXTS] = { 220, 220,
220 };
#endif
+int av1_intra_mode_ind[INTRA_MODES];
+int av1_intra_mode_inv[INTRA_MODES];
+int av1_inter_mode_ind[INTER_MODES];
+int av1_inter_mode_inv[INTER_MODES];
/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
const aom_tree_index av1_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
@@ -247,31 +391,64 @@
};
const aom_tree_index av1_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
- -INTER_OFFSET(ZEROMV), 2, -INTER_OFFSET(NEARESTMV), 4, -INTER_OFFSET(NEARMV),
- -INTER_OFFSET(NEWMV)
+ -INTER_OFFSET(ZEROMV), 2,
+ -INTER_OFFSET(NEARESTMV), 4,
+#if CONFIG_EXT_INTER
+ -INTER_OFFSET(NEARMV), 6,
+ -INTER_OFFSET(NEWMV), -INTER_OFFSET(NEWFROMNEARMV)
+#else
+ -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV)
+#endif // CONFIG_EXT_INTER
};
-#if CONFIG_MOTION_VAR
-const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)] = {
- -SIMPLE_TRANSLATION, -OBMC_CAUSAL
+#if CONFIG_EXT_INTER
+/* clang-format off */
+const aom_tree_index av1_interintra_mode_tree[TREE_SIZE(INTERINTRA_MODES)] = {
+ -II_DC_PRED, 2, /* 0 = II_DC_NODE */
+ -II_TM_PRED, 4, /* 1 = II_TM_NODE */
+ -II_V_PRED, 6, /* 2 = II_V_NODE */
+ 8, 12, /* 3 = II_COM_NODE */
+ -II_H_PRED, 10, /* 4 = II_H_NODE */
+ -II_D135_PRED, -II_D117_PRED, /* 5 = II_D135_NODE */
+ -II_D45_PRED, 14, /* 6 = II_D45_NODE */
+ -II_D63_PRED, 16, /* 7 = II_D63_NODE */
+ -II_D153_PRED, -II_D207_PRED /* 8 = II_D153_NODE */
};
-// clang-format off
-static const aom_prob
- default_motion_mode_prob[BLOCK_SIZES][MOTION_MODES - 1] = {
- { 255 },
- { 255 }, { 255 }, { 151 },
- { 153 }, { 144 }, { 178 },
- { 165 }, { 160 }, { 207 },
- { 195 }, { 168 }, { 244 },
- };
-// clang-format on
-#endif // CONFIG_MOTION_VAR
+const aom_tree_index av1_inter_compound_mode_tree
+ [TREE_SIZE(INTER_COMPOUND_MODES)] = {
+ -INTER_COMPOUND_OFFSET(ZERO_ZEROMV), 2,
+ -INTER_COMPOUND_OFFSET(NEAREST_NEARESTMV), 4,
+ 6, -INTER_COMPOUND_OFFSET(NEW_NEWMV),
+ 8, 12,
+ -INTER_COMPOUND_OFFSET(NEAR_NEARMV), 10,
+ -INTER_COMPOUND_OFFSET(NEAREST_NEARMV),
+ -INTER_COMPOUND_OFFSET(NEAR_NEARESTMV),
+ 14, 16,
+ -INTER_COMPOUND_OFFSET(NEAREST_NEWMV), -INTER_COMPOUND_OFFSET(NEW_NEARESTMV),
+ -INTER_COMPOUND_OFFSET(NEAR_NEWMV), -INTER_COMPOUND_OFFSET(NEW_NEARMV)
+};
+/* clang-format on */
+#endif // CONFIG_EXT_INTER
const aom_tree_index av1_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
-PARTITION_NONE, 2, -PARTITION_HORZ, 4, -PARTITION_VERT, -PARTITION_SPLIT
};
+#if CONFIG_EXT_PARTITION_TYPES
+/* clang-format off */
+const aom_tree_index av1_ext_partition_tree[TREE_SIZE(EXT_PARTITION_TYPES)] = {
+ -PARTITION_NONE, 2,
+ 6, 4,
+ 8, -PARTITION_SPLIT,
+ -PARTITION_HORZ, 10,
+ -PARTITION_VERT, 12,
+ -PARTITION_HORZ_A, -PARTITION_HORZ_B,
+ -PARTITION_VERT_A, -PARTITION_VERT_B
+};
+/* clang-format on */
+#endif // CONFIG_EXT_PARTITION_TYPES
+
static const aom_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
9, 102, 187, 225
};
@@ -281,8 +458,8 @@
};
#if CONFIG_EXT_REFS
-// TODO(zoeliu): To adjust the initial prob values.
-static const aom_prob default_comp_fwdref_p[REF_CONTEXTS][FWD_REFS - 1] = {
+static const aom_prob default_comp_ref_p[REF_CONTEXTS][FWD_REFS - 1] = {
+ // TODO(zoeliu): To adjust the initial prob values.
{ 33, 16, 16 },
{ 77, 74, 74 },
{ 142, 142, 142 },
@@ -293,8 +470,9 @@
{ 16 }, { 74 }, { 142 }, { 170 }, { 247 }
};
#else
-static const aom_prob default_comp_ref_p[REF_CONTEXTS] = { 50, 126, 123, 221,
- 226 };
+static const aom_prob default_comp_ref_p[REF_CONTEXTS][COMP_REFS - 1] = {
+ { 50 }, { 126 }, { 123 }, { 221 }, { 226 }
+};
#endif // CONFIG_EXT_REFS
static const aom_prob default_single_ref_p[REF_CONTEXTS][SINGLE_REFS - 1] = {
@@ -309,23 +487,6 @@
#endif // CONFIG_EXT_REFS
};
-#if CONFIG_CB4X4
-static const struct tx_probs default_tx_probs = {
- { { 1, 3, 136, 37 }, { 1, 5, 52, 13 } },
-
- { { 1, 20, 152 }, { 1, 15, 101 } },
-
- { { 1, 100 }, { 1, 66 } }
-};
-#else
-static const struct tx_probs default_tx_probs = { { { 3, 136, 37 },
- { 5, 52, 13 } },
-
- { { 20, 152 }, { 15, 101 } },
-
- { { 100 }, { 66 } } };
-#endif
-
#if CONFIG_PALETTE
const aom_tree_index av1_palette_size_tree[TREE_SIZE(PALETTE_SIZES)] = {
-TWO_COLORS, 2, -THREE_COLORS, 4, -FOUR_COLORS, 6,
@@ -340,6 +501,10 @@
{ 116, 76, 107, 46, 65, 105 }, { 112, 82, 94, 40, 70, 112 },
{ 147, 124, 123, 58, 69, 103 }, { 180, 113, 136, 49, 45, 114 },
{ 107, 70, 87, 49, 154, 156 }, { 98, 105, 142, 63, 64, 152 },
+#if CONFIG_EXT_PARTITION
+ { 98, 105, 142, 63, 64, 152 }, { 98, 105, 142, 63, 64, 152 },
+ { 98, 105, 142, 63, 64, 152 },
+#endif // CONFIG_EXT_PARTITION
};
const aom_prob
@@ -349,360 +514,395 @@
{ 59, 92, 131, 78, 92, 142 }, { 75, 118, 149, 84, 90, 128 },
{ 89, 87, 92, 66, 66, 128 }, { 67, 53, 54, 55, 66, 93 },
{ 120, 130, 83, 171, 75, 214 }, { 72, 55, 66, 68, 79, 107 },
+#if CONFIG_EXT_PARTITION
+ { 72, 55, 66, 68, 79, 107 }, { 72, 55, 66, 68, 79, 107 },
+ { 72, 55, 66, 68, 79, 107 },
+#endif // CONFIG_EXT_PARTITION
};
-const aom_prob av1_default_palette_y_mode_prob[PALETTE_BLOCK_SIZES]
- [PALETTE_Y_MODE_CONTEXTS] = {
- {
- 240, 180, 100,
- },
- {
- 240, 180, 100,
- },
- {
- 240, 180, 100,
- },
- {
- 240, 180, 100,
- },
- {
- 240, 180, 100,
- },
- {
- 240, 180, 100,
- },
- {
- 240, 180, 100,
- },
- {
- 240, 180, 100,
- },
- {
- 240, 180, 100,
- },
- {
- 240, 180, 100,
- },
- };
+const aom_prob
+ av1_default_palette_y_mode_prob[PALETTE_BLOCK_SIZES]
+ [PALETTE_Y_MODE_CONTEXTS] = {
+ { 240, 180, 100 }, { 240, 180, 100 },
+ { 240, 180, 100 }, { 240, 180, 100 },
+ { 240, 180, 100 }, { 240, 180, 100 },
+ { 240, 180, 100 }, { 240, 180, 100 },
+ { 240, 180, 100 }, { 240, 180, 100 },
+#if CONFIG_EXT_PARTITION
+ { 240, 180, 100 }, { 240, 180, 100 },
+ { 240, 180, 100 },
+#endif // CONFIG_EXT_PARTITION
+ };
const aom_prob av1_default_palette_uv_mode_prob[2] = { 253, 229 };
const aom_tree_index
av1_palette_color_tree[PALETTE_MAX_SIZE - 1][TREE_SIZE(PALETTE_COLORS)] = {
+ { // 2 colors
+ -PALETTE_COLOR_ONE, -PALETTE_COLOR_TWO },
+ { // 3 colors
+ -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, -PALETTE_COLOR_THREE },
+ { // 4 colors
+ -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, 4, -PALETTE_COLOR_THREE,
+ -PALETTE_COLOR_FOUR },
+ { // 5 colors
+ -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, 4, -PALETTE_COLOR_THREE, 6,
+ -PALETTE_COLOR_FOUR, -PALETTE_COLOR_FIVE },
+ { // 6 colors
+ -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, 4, -PALETTE_COLOR_THREE, 6,
+ -PALETTE_COLOR_FOUR, 8, -PALETTE_COLOR_FIVE, -PALETTE_COLOR_SIX },
+ { // 7 colors
+ -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, 4, -PALETTE_COLOR_THREE, 6,
+ -PALETTE_COLOR_FOUR, 8, -PALETTE_COLOR_FIVE, 10, -PALETTE_COLOR_SIX,
+ -PALETTE_COLOR_SEVEN },
+ { // 8 colors
+ -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, 4, -PALETTE_COLOR_THREE, 6,
+ -PALETTE_COLOR_FOUR, 8, -PALETTE_COLOR_FIVE, 10, -PALETTE_COLOR_SIX, 12,
+ -PALETTE_COLOR_SEVEN, -PALETTE_COLOR_EIGHT },
+ };
+
+// Note: Has to be non-zero to avoid triggering any asserts.
+#define UNUSED_PROB 128
+
+const aom_prob av1_default_palette_y_color_prob
+ [PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] = {
{
// 2 colors
- -PALETTE_COLOR_ONE, -PALETTE_COLOR_TWO,
+ { 230, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 214, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 240, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 73, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 130, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 227, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 188, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 75, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 250, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 223, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 252, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
},
{
// 3 colors
- -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, -PALETTE_COLOR_THREE,
+ { 229, 137, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 197, 120, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 107, 195, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 27, 151, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 230, 130, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 37, 230, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 67, 221, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 124, 230, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 195, 109, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 99, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 205, 208, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 40, 235, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 251, 132, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 237, 186, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 253, 112, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
},
{
// 4 colors
- -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, 4, -PALETTE_COLOR_THREE,
- -PALETTE_COLOR_FOUR,
+ { 195, 87, 128, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 143, 100, 123, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 94, 124, 119, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 77, 91, 130, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 39, 114, 178, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 222, 94, 125, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 44, 203, 132, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 68, 175, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 110, 187, 124, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 152, 91, 128, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 70, 109, 181, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 133, 113, 164, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 47, 205, 133, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 247, 94, 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 205, 122, 146, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 251, 100, 141, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
},
{
// 5 colors
- -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, 4, -PALETTE_COLOR_THREE, 6,
- -PALETTE_COLOR_FOUR, -PALETTE_COLOR_FIVE,
+ { 195, 65, 84, 125, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 150, 76, 84, 121, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 94, 110, 81, 117, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 79, 85, 91, 139, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 26, 102, 139, 127, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 220, 73, 91, 119, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 38, 203, 86, 127, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 61, 186, 72, 124, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 132, 199, 84, 128, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 172, 52, 62, 120, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 102, 89, 121, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 182, 48, 69, 186, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 36, 206, 87, 126, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 249, 55, 67, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 218, 88, 75, 122, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 253, 64, 80, 119, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
},
{
// 6 colors
- -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, 4, -PALETTE_COLOR_THREE, 6,
- -PALETTE_COLOR_FOUR, 8, -PALETTE_COLOR_FIVE, -PALETTE_COLOR_SIX,
+ { 182, 54, 64, 75, 118, UNUSED_PROB, UNUSED_PROB },
+ { 126, 67, 70, 76, 116, UNUSED_PROB, UNUSED_PROB },
+ { 79, 92, 67, 85, 120, UNUSED_PROB, UNUSED_PROB },
+ { 63, 61, 81, 118, 132, UNUSED_PROB, UNUSED_PROB },
+ { 21, 80, 105, 83, 119, UNUSED_PROB, UNUSED_PROB },
+ { 215, 72, 74, 74, 111, UNUSED_PROB, UNUSED_PROB },
+ { 50, 176, 63, 79, 120, UNUSED_PROB, UNUSED_PROB },
+ { 72, 148, 66, 77, 120, UNUSED_PROB, UNUSED_PROB },
+ { 105, 177, 57, 78, 130, UNUSED_PROB, UNUSED_PROB },
+ { 150, 66, 66, 80, 127, UNUSED_PROB, UNUSED_PROB },
+ { 81, 76, 109, 85, 116, UNUSED_PROB, UNUSED_PROB },
+ { 113, 81, 62, 96, 148, UNUSED_PROB, UNUSED_PROB },
+ { 54, 179, 69, 82, 121, UNUSED_PROB, UNUSED_PROB },
+ { 244, 47, 48, 67, 118, UNUSED_PROB, UNUSED_PROB },
+ { 198, 83, 53, 65, 121, UNUSED_PROB, UNUSED_PROB },
+ { 250, 42, 51, 69, 110, UNUSED_PROB, UNUSED_PROB },
},
{
// 7 colors
- -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, 4, -PALETTE_COLOR_THREE, 6,
- -PALETTE_COLOR_FOUR, 8, -PALETTE_COLOR_FIVE, 10, -PALETTE_COLOR_SIX,
- -PALETTE_COLOR_SEVEN,
+ { 182, 45, 54, 62, 74, 113, UNUSED_PROB },
+ { 124, 63, 57, 62, 77, 114, UNUSED_PROB },
+ { 77, 80, 56, 66, 76, 117, UNUSED_PROB },
+ { 63, 57, 69, 98, 85, 131, UNUSED_PROB },
+ { 19, 81, 98, 63, 80, 116, UNUSED_PROB },
+ { 215, 56, 60, 63, 68, 105, UNUSED_PROB },
+ { 50, 174, 50, 60, 79, 118, UNUSED_PROB },
+ { 68, 151, 50, 58, 73, 117, UNUSED_PROB },
+ { 104, 182, 53, 57, 79, 127, UNUSED_PROB },
+ { 156, 50, 51, 63, 77, 111, UNUSED_PROB },
+ { 88, 67, 97, 59, 82, 120, UNUSED_PROB },
+ { 114, 81, 46, 65, 103, 132, UNUSED_PROB },
+ { 55, 166, 57, 66, 82, 120, UNUSED_PROB },
+ { 245, 34, 38, 43, 63, 114, UNUSED_PROB },
+ { 203, 68, 45, 47, 60, 118, UNUSED_PROB },
+ { 250, 35, 37, 47, 66, 110, UNUSED_PROB },
},
{
// 8 colors
- -PALETTE_COLOR_ONE, 2, -PALETTE_COLOR_TWO, 4, -PALETTE_COLOR_THREE, 6,
- -PALETTE_COLOR_FOUR, 8, -PALETTE_COLOR_FIVE, 10, -PALETTE_COLOR_SIX,
- 12, -PALETTE_COLOR_SEVEN, -PALETTE_COLOR_EIGHT,
- },
+ { 180, 43, 46, 50, 56, 69, 109 },
+ { 116, 53, 51, 49, 57, 73, 115 },
+ { 79, 70, 49, 50, 59, 74, 117 },
+ { 60, 54, 57, 70, 62, 83, 129 },
+ { 20, 73, 85, 52, 66, 81, 119 },
+ { 213, 56, 52, 49, 53, 62, 104 },
+ { 48, 161, 41, 45, 56, 77, 116 },
+ { 68, 139, 40, 47, 54, 71, 116 },
+ { 123, 166, 42, 43, 52, 76, 130 },
+ { 153, 44, 44, 47, 54, 79, 129 },
+ { 87, 64, 83, 49, 60, 75, 127 },
+ { 131, 68, 43, 48, 73, 96, 130 },
+ { 55, 152, 45, 51, 64, 77, 113 },
+ { 243, 30, 28, 33, 41, 65, 114 },
+ { 202, 56, 35, 36, 42, 63, 123 },
+ { 249, 31, 29, 32, 45, 68, 111 },
+ }
};
-const aom_prob
- av1_default_palette_y_color_prob[PALETTE_MAX_SIZE - 1]
- [PALETTE_COLOR_CONTEXTS]
- [PALETTE_COLORS - 1] = {
- {
- // 2 colors
- { 230, 0, 0, 0, 0, 0, 0 },
- { 214, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 240, 0, 0, 0, 0, 0, 0 },
- { 73, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 130, 0, 0, 0, 0, 0, 0 },
- { 227, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 188, 0, 0, 0, 0, 0, 0 },
- { 75, 0, 0, 0, 0, 0, 0 },
- { 250, 0, 0, 0, 0, 0, 0 },
- { 223, 0, 0, 0, 0, 0, 0 },
- { 252, 0, 0, 0, 0, 0, 0 },
- },
- {
- // 3 colors
- { 229, 137, 0, 0, 0, 0, 0 },
- { 197, 120, 0, 0, 0, 0, 0 },
- { 107, 195, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 27, 151, 0, 0, 0, 0, 0 },
- { 230, 130, 0, 0, 0, 0, 0 },
- { 37, 230, 0, 0, 0, 0, 0 },
- { 67, 221, 0, 0, 0, 0, 0 },
- { 124, 230, 0, 0, 0, 0, 0 },
- { 195, 109, 0, 0, 0, 0, 0 },
- { 99, 122, 0, 0, 0, 0, 0 },
- { 205, 208, 0, 0, 0, 0, 0 },
- { 40, 235, 0, 0, 0, 0, 0 },
- { 251, 132, 0, 0, 0, 0, 0 },
- { 237, 186, 0, 0, 0, 0, 0 },
- { 253, 112, 0, 0, 0, 0, 0 },
- },
- {
- // 4 colors
- { 195, 87, 128, 0, 0, 0, 0 },
- { 143, 100, 123, 0, 0, 0, 0 },
- { 94, 124, 119, 0, 0, 0, 0 },
- { 77, 91, 130, 0, 0, 0, 0 },
- { 39, 114, 178, 0, 0, 0, 0 },
- { 222, 94, 125, 0, 0, 0, 0 },
- { 44, 203, 132, 0, 0, 0, 0 },
- { 68, 175, 122, 0, 0, 0, 0 },
- { 110, 187, 124, 0, 0, 0, 0 },
- { 152, 91, 128, 0, 0, 0, 0 },
- { 70, 109, 181, 0, 0, 0, 0 },
- { 133, 113, 164, 0, 0, 0, 0 },
- { 47, 205, 133, 0, 0, 0, 0 },
- { 247, 94, 136, 0, 0, 0, 0 },
- { 205, 122, 146, 0, 0, 0, 0 },
- { 251, 100, 141, 0, 0, 0, 0 },
- },
- {
- // 5 colors
- { 195, 65, 84, 125, 0, 0, 0 },
- { 150, 76, 84, 121, 0, 0, 0 },
- { 94, 110, 81, 117, 0, 0, 0 },
- { 79, 85, 91, 139, 0, 0, 0 },
- { 26, 102, 139, 127, 0, 0, 0 },
- { 220, 73, 91, 119, 0, 0, 0 },
- { 38, 203, 86, 127, 0, 0, 0 },
- { 61, 186, 72, 124, 0, 0, 0 },
- { 132, 199, 84, 128, 0, 0, 0 },
- { 172, 52, 62, 120, 0, 0, 0 },
- { 102, 89, 121, 122, 0, 0, 0 },
- { 182, 48, 69, 186, 0, 0, 0 },
- { 36, 206, 87, 126, 0, 0, 0 },
- { 249, 55, 67, 122, 0, 0, 0 },
- { 218, 88, 75, 122, 0, 0, 0 },
- { 253, 64, 80, 119, 0, 0, 0 },
- },
- {
- // 6 colors
- { 182, 54, 64, 75, 118, 0, 0 },
- { 126, 67, 70, 76, 116, 0, 0 },
- { 79, 92, 67, 85, 120, 0, 0 },
- { 63, 61, 81, 118, 132, 0, 0 },
- { 21, 80, 105, 83, 119, 0, 0 },
- { 215, 72, 74, 74, 111, 0, 0 },
- { 50, 176, 63, 79, 120, 0, 0 },
- { 72, 148, 66, 77, 120, 0, 0 },
- { 105, 177, 57, 78, 130, 0, 0 },
- { 150, 66, 66, 80, 127, 0, 0 },
- { 81, 76, 109, 85, 116, 0, 0 },
- { 113, 81, 62, 96, 148, 0, 0 },
- { 54, 179, 69, 82, 121, 0, 0 },
- { 244, 47, 48, 67, 118, 0, 0 },
- { 198, 83, 53, 65, 121, 0, 0 },
- { 250, 42, 51, 69, 110, 0, 0 },
- },
- {
- // 7 colors
- { 182, 45, 54, 62, 74, 113, 0 },
- { 124, 63, 57, 62, 77, 114, 0 },
- { 77, 80, 56, 66, 76, 117, 0 },
- { 63, 57, 69, 98, 85, 131, 0 },
- { 19, 81, 98, 63, 80, 116, 0 },
- { 215, 56, 60, 63, 68, 105, 0 },
- { 50, 174, 50, 60, 79, 118, 0 },
- { 68, 151, 50, 58, 73, 117, 0 },
- { 104, 182, 53, 57, 79, 127, 0 },
- { 156, 50, 51, 63, 77, 111, 0 },
- { 88, 67, 97, 59, 82, 120, 0 },
- { 114, 81, 46, 65, 103, 132, 0 },
- { 55, 166, 57, 66, 82, 120, 0 },
- { 245, 34, 38, 43, 63, 114, 0 },
- { 203, 68, 45, 47, 60, 118, 0 },
- { 250, 35, 37, 47, 66, 110, 0 },
- },
- {
- // 8 colors
- { 180, 43, 46, 50, 56, 69, 109 },
- { 116, 53, 51, 49, 57, 73, 115 },
- { 79, 70, 49, 50, 59, 74, 117 },
- { 60, 54, 57, 70, 62, 83, 129 },
- { 20, 73, 85, 52, 66, 81, 119 },
- { 213, 56, 52, 49, 53, 62, 104 },
- { 48, 161, 41, 45, 56, 77, 116 },
- { 68, 139, 40, 47, 54, 71, 116 },
- { 123, 166, 42, 43, 52, 76, 130 },
- { 153, 44, 44, 47, 54, 79, 129 },
- { 87, 64, 83, 49, 60, 75, 127 },
- { 131, 68, 43, 48, 73, 96, 130 },
- { 55, 152, 45, 51, 64, 77, 113 },
- { 243, 30, 28, 33, 41, 65, 114 },
- { 202, 56, 35, 36, 42, 63, 123 },
- { 249, 31, 29, 32, 45, 68, 111 },
- }
- };
+const aom_prob av1_default_palette_uv_color_prob
+ [PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] = {
+ {
+ // 2 colors
+ { 228, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 195, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 228, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 71, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 129, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 206, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 98, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 236, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 222, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 249, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 3 colors
+ { 198, 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 178, 105, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 100, 206, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB, UNUSED_PROB },
+ { 12, 136, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 219, 134, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 50, 198, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 61, 231, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 110, 209, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 173, 106, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 145, 166, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 156, 175, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 69, 183, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 241, 163, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 224, 160, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ { 246, 154, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB,
+ UNUSED_PROB },
+ },
+ {
+ // 4 colors
+ { 173, 88, 143, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 146, 81, 127, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 84, 134, 102, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 69, 138, 140, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 31, 103, 200, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 217, 101, 139, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 51, 174, 121, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 64, 177, 109, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 96, 179, 145, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 164, 77, 114, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 87, 94, 156, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 105, 57, 173, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 63, 158, 137, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 236, 102, 156, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 197, 115, 153, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 245, 106, 154, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 5 colors
+ { 179, 64, 97, 129, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 137, 56, 88, 125, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 82, 107, 61, 118, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 59, 113, 86, 115, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 23, 88, 118, 130, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 213, 66, 90, 125, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 37, 181, 103, 121, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 47, 188, 61, 131, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 104, 185, 103, 144, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 163, 39, 76, 112, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 94, 74, 131, 126, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 142, 42, 103, 163, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 53, 162, 99, 149, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 239, 54, 84, 108, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 203, 84, 110, 147, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ { 248, 70, 105, 151, UNUSED_PROB, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 6 colors
+ { 189, 50, 67, 90, 130, UNUSED_PROB, UNUSED_PROB },
+ { 114, 50, 55, 90, 123, UNUSED_PROB, UNUSED_PROB },
+ { 66, 76, 54, 82, 128, UNUSED_PROB, UNUSED_PROB },
+ { 43, 69, 69, 80, 129, UNUSED_PROB, UNUSED_PROB },
+ { 22, 59, 87, 88, 141, UNUSED_PROB, UNUSED_PROB },
+ { 203, 49, 68, 87, 122, UNUSED_PROB, UNUSED_PROB },
+ { 43, 157, 74, 104, 146, UNUSED_PROB, UNUSED_PROB },
+ { 54, 138, 51, 95, 138, UNUSED_PROB, UNUSED_PROB },
+ { 82, 171, 58, 102, 146, UNUSED_PROB, UNUSED_PROB },
+ { 129, 38, 59, 64, 168, UNUSED_PROB, UNUSED_PROB },
+ { 56, 67, 119, 92, 112, UNUSED_PROB, UNUSED_PROB },
+ { 96, 62, 53, 132, 82, UNUSED_PROB, UNUSED_PROB },
+ { 60, 147, 77, 108, 145, UNUSED_PROB, UNUSED_PROB },
+ { 238, 76, 73, 93, 148, UNUSED_PROB, UNUSED_PROB },
+ { 189, 86, 73, 103, 157, UNUSED_PROB, UNUSED_PROB },
+ { 246, 62, 75, 83, 167, UNUSED_PROB, UNUSED_PROB },
+ },
+ {
+ // 7 colors
+ { 179, 42, 51, 73, 99, 134, UNUSED_PROB },
+ { 119, 52, 52, 61, 64, 114, UNUSED_PROB },
+ { 53, 77, 35, 65, 71, 131, UNUSED_PROB },
+ { 38, 70, 51, 68, 89, 144, UNUSED_PROB },
+ { 23, 65, 128, 73, 97, 131, UNUSED_PROB },
+ { 210, 47, 52, 63, 81, 143, UNUSED_PROB },
+ { 42, 159, 57, 68, 98, 143, UNUSED_PROB },
+ { 49, 153, 45, 82, 93, 143, UNUSED_PROB },
+ { 81, 169, 52, 72, 113, 151, UNUSED_PROB },
+ { 136, 46, 35, 56, 75, 96, UNUSED_PROB },
+ { 57, 84, 109, 47, 107, 131, UNUSED_PROB },
+ { 128, 78, 57, 36, 128, 85, UNUSED_PROB },
+ { 54, 149, 68, 77, 94, 153, UNUSED_PROB },
+ { 243, 58, 50, 71, 81, 167, UNUSED_PROB },
+ { 189, 92, 64, 70, 121, 173, UNUSED_PROB },
+ { 248, 35, 38, 51, 82, 201, UNUSED_PROB },
+ },
+ {
+ // 8 colors
+ { 201, 40, 36, 42, 64, 92, 123 },
+ { 116, 43, 33, 43, 73, 102, 128 },
+ { 46, 77, 37, 69, 62, 78, 150 },
+ { 40, 65, 52, 50, 76, 89, 133 },
+ { 28, 48, 91, 17, 64, 77, 133 },
+ { 218, 43, 43, 37, 56, 72, 163 },
+ { 41, 155, 44, 83, 82, 129, 180 },
+ { 44, 141, 29, 55, 64, 89, 147 },
+ { 92, 166, 48, 45, 59, 126, 179 },
+ { 169, 35, 49, 41, 36, 99, 139 },
+ { 55, 77, 77, 56, 60, 75, 156 },
+ { 155, 81, 51, 64, 57, 182, 255 },
+ { 60, 134, 49, 49, 93, 128, 174 },
+ { 244, 98, 51, 46, 22, 73, 238 },
+ { 189, 70, 40, 87, 93, 79, 201 },
+ { 248, 54, 49, 40, 29, 42, 227 },
+ }
+ };
-const aom_prob
- av1_default_palette_uv_color_prob[PALETTE_MAX_SIZE - 1]
- [PALETTE_COLOR_CONTEXTS]
- [PALETTE_COLORS - 1] = {
- {
- // 2 colors
- { 228, 0, 0, 0, 0, 0, 0 },
- { 195, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 228, 0, 0, 0, 0, 0, 0 },
- { 71, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 129, 0, 0, 0, 0, 0, 0 },
- { 206, 0, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 136, 0, 0, 0, 0, 0, 0 },
- { 98, 0, 0, 0, 0, 0, 0 },
- { 236, 0, 0, 0, 0, 0, 0 },
- { 222, 0, 0, 0, 0, 0, 0 },
- { 249, 0, 0, 0, 0, 0, 0 },
- },
- {
- // 3 colors
- { 198, 136, 0, 0, 0, 0, 0 },
- { 178, 105, 0, 0, 0, 0, 0 },
- { 100, 206, 0, 0, 0, 0, 0 },
- { 0, 0, 0, 0, 0, 0, 0 },
- { 12, 136, 0, 0, 0, 0, 0 },
- { 219, 134, 0, 0, 0, 0, 0 },
- { 50, 198, 0, 0, 0, 0, 0 },
- { 61, 231, 0, 0, 0, 0, 0 },
- { 110, 209, 0, 0, 0, 0, 0 },
- { 173, 106, 0, 0, 0, 0, 0 },
- { 145, 166, 0, 0, 0, 0, 0 },
- { 156, 175, 0, 0, 0, 0, 0 },
- { 69, 183, 0, 0, 0, 0, 0 },
- { 241, 163, 0, 0, 0, 0, 0 },
- { 224, 160, 0, 0, 0, 0, 0 },
- { 246, 154, 0, 0, 0, 0, 0 },
- },
- {
- // 4 colors
- { 173, 88, 143, 0, 0, 0, 0 },
- { 146, 81, 127, 0, 0, 0, 0 },
- { 84, 134, 102, 0, 0, 0, 0 },
- { 69, 138, 140, 0, 0, 0, 0 },
- { 31, 103, 200, 0, 0, 0, 0 },
- { 217, 101, 139, 0, 0, 0, 0 },
- { 51, 174, 121, 0, 0, 0, 0 },
- { 64, 177, 109, 0, 0, 0, 0 },
- { 96, 179, 145, 0, 0, 0, 0 },
- { 164, 77, 114, 0, 0, 0, 0 },
- { 87, 94, 156, 0, 0, 0, 0 },
- { 105, 57, 173, 0, 0, 0, 0 },
- { 63, 158, 137, 0, 0, 0, 0 },
- { 236, 102, 156, 0, 0, 0, 0 },
- { 197, 115, 153, 0, 0, 0, 0 },
- { 245, 106, 154, 0, 0, 0, 0 },
- },
- {
- // 5 colors
- { 179, 64, 97, 129, 0, 0, 0 },
- { 137, 56, 88, 125, 0, 0, 0 },
- { 82, 107, 61, 118, 0, 0, 0 },
- { 59, 113, 86, 115, 0, 0, 0 },
- { 23, 88, 118, 130, 0, 0, 0 },
- { 213, 66, 90, 125, 0, 0, 0 },
- { 37, 181, 103, 121, 0, 0, 0 },
- { 47, 188, 61, 131, 0, 0, 0 },
- { 104, 185, 103, 144, 0, 0, 0 },
- { 163, 39, 76, 112, 0, 0, 0 },
- { 94, 74, 131, 126, 0, 0, 0 },
- { 142, 42, 103, 163, 0, 0, 0 },
- { 53, 162, 99, 149, 0, 0, 0 },
- { 239, 54, 84, 108, 0, 0, 0 },
- { 203, 84, 110, 147, 0, 0, 0 },
- { 248, 70, 105, 151, 0, 0, 0 },
- },
- {
- // 6 colors
- { 189, 50, 67, 90, 130, 0, 0 },
- { 114, 50, 55, 90, 123, 0, 0 },
- { 66, 76, 54, 82, 128, 0, 0 },
- { 43, 69, 69, 80, 129, 0, 0 },
- { 22, 59, 87, 88, 141, 0, 0 },
- { 203, 49, 68, 87, 122, 0, 0 },
- { 43, 157, 74, 104, 146, 0, 0 },
- { 54, 138, 51, 95, 138, 0, 0 },
- { 82, 171, 58, 102, 146, 0, 0 },
- { 129, 38, 59, 64, 168, 0, 0 },
- { 56, 67, 119, 92, 112, 0, 0 },
- { 96, 62, 53, 132, 82, 0, 0 },
- { 60, 147, 77, 108, 145, 0, 0 },
- { 238, 76, 73, 93, 148, 0, 0 },
- { 189, 86, 73, 103, 157, 0, 0 },
- { 246, 62, 75, 83, 167, 0, 0 },
- },
- {
- // 7 colors
- { 179, 42, 51, 73, 99, 134, 0 },
- { 119, 52, 52, 61, 64, 114, 0 },
- { 53, 77, 35, 65, 71, 131, 0 },
- { 38, 70, 51, 68, 89, 144, 0 },
- { 23, 65, 128, 73, 97, 131, 0 },
- { 210, 47, 52, 63, 81, 143, 0 },
- { 42, 159, 57, 68, 98, 143, 0 },
- { 49, 153, 45, 82, 93, 143, 0 },
- { 81, 169, 52, 72, 113, 151, 0 },
- { 136, 46, 35, 56, 75, 96, 0 },
- { 57, 84, 109, 47, 107, 131, 0 },
- { 128, 78, 57, 36, 128, 85, 0 },
- { 54, 149, 68, 77, 94, 153, 0 },
- { 243, 58, 50, 71, 81, 167, 0 },
- { 189, 92, 64, 70, 121, 173, 0 },
- { 248, 35, 38, 51, 82, 201, 0 },
- },
- {
- // 8 colors
- { 201, 40, 36, 42, 64, 92, 123 },
- { 116, 43, 33, 43, 73, 102, 128 },
- { 46, 77, 37, 69, 62, 78, 150 },
- { 40, 65, 52, 50, 76, 89, 133 },
- { 28, 48, 91, 17, 64, 77, 133 },
- { 218, 43, 43, 37, 56, 72, 163 },
- { 41, 155, 44, 83, 82, 129, 180 },
- { 44, 141, 29, 55, 64, 89, 147 },
- { 92, 166, 48, 45, 59, 126, 179 },
- { 169, 35, 49, 41, 36, 99, 139 },
- { 55, 77, 77, 56, 60, 75, 156 },
- { 155, 81, 51, 64, 57, 182, 255 },
- { 60, 134, 49, 49, 93, 128, 174 },
- { 244, 98, 51, 46, 22, 73, 238 },
- { 189, 70, 40, 87, 93, 79, 201 },
- { 248, 54, 49, 40, 29, 42, 227 },
- }
- };
+#undef UNUSED_PROB
static const int palette_color_context_lookup[PALETTE_COLOR_CONTEXTS] = {
// (3, 0, 0, 0), (3, 2, 0, 0), (3, 3, 2, 0), (3, 3, 2, 2),
@@ -714,56 +914,111 @@
// (7, 3, 0, 0), (8, 0, 0, 0), (8, 2, 0, 0), (10, 0, 0, 0)
9680, 10648, 10890, 13310
};
+#endif // CONFIG_PALETTE
+const aom_tree_index av1_tx_size_tree[MAX_TX_DEPTH][TREE_SIZE(TX_SIZES)] = {
+ {
+ // Max tx_size is 8X8
+ -0, -1,
+ },
+ {
+ // Max tx_size is 16X16
+ -0, 2, -1, -2,
+ },
+ {
+ // Max tx_size is 32X32
+ -0, 2, -1, 4, -2, -3,
+ },
+};
+
+static const aom_prob default_tx_size_prob[MAX_TX_DEPTH][TX_SIZE_CONTEXTS]
+ [MAX_TX_DEPTH] = {
+ {
+ // Max tx_size is 8X8
+ { 100 },
+ { 66 },
+ },
+ {
+ // Max tx_size is 16X16
+ { 20, 152 },
+ { 15, 101 },
+ },
+ {
+ // Max tx_size is 32X32
+ { 3, 136, 37 },
+ { 5, 52, 13 },
+ },
+ };
+
+#if CONFIG_LOOP_RESTORATION
+const aom_tree_index
+ av1_switchable_restore_tree[TREE_SIZE(RESTORE_SWITCHABLE_TYPES)] = {
+ -RESTORE_NONE, 2, -RESTORE_BILATERAL, -RESTORE_WIENER,
+ };
+
+static const aom_prob
+ default_switchable_restore_prob[RESTORE_SWITCHABLE_TYPES - 1] = { 32, 128 };
+#endif // CONFIG_LOOP_RESTORATION
+
+#if CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_VAR_TX
+// The probability of (0) using a recursive square tx partition vs.
+// (1) the largest rectangular tx for 4X8-8X4/8X16-16X8/16X32-32X16 blocks.
+static const aom_prob default_rect_tx_prob[TX_SIZES - 1] = { 192, 192, 192 };
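+// (An aom_prob is the probability of the 0-branch scaled to 256, so 192 here
+// means roughly a 3/4 chance of the recursive square partition.)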
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX && CONFIG_VAR_TX
+
+#if CONFIG_PALETTE
int av1_get_palette_color_context(const uint8_t *color_map, int cols, int r,
int c, int n, uint8_t *color_order,
int *color_idx) {
- int i, j, max, max_idx, temp;
+ int i;
+ // The +10 below should not be needed. But we get a warning "array subscript
+ // is above array bounds [-Werror=array-bounds]" without it, possibly due to
+ // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124
int scores[PALETTE_MAX_SIZE + 10];
- int weights[4] = { 3, 2, 3, 2 };
- int color_ctx = 0;
+ const int weights[4] = { 3, 2, 3, 2 };
+ int color_ctx_hash;
+ int color_ctx;
int color_neighbors[4];
int inverse_color_order[PALETTE_MAX_SIZE];
assert(n <= PALETTE_MAX_SIZE);
- if (c - 1 >= 0)
- color_neighbors[0] = color_map[r * cols + c - 1];
- else
- color_neighbors[0] = -1;
- if (c - 1 >= 0 && r - 1 >= 0)
- color_neighbors[1] = color_map[(r - 1) * cols + c - 1];
- else
- color_neighbors[1] = -1;
- if (r - 1 >= 0)
- color_neighbors[2] = color_map[(r - 1) * cols + c];
- else
- color_neighbors[2] = -1;
- if (r - 1 >= 0 && c + 1 <= cols - 1)
- color_neighbors[3] = color_map[(r - 1) * cols + c + 1];
- else
- color_neighbors[3] = -1;
+
+ // Get color indices of neighbors.
+ color_neighbors[0] = (c - 1 >= 0) ? color_map[r * cols + c - 1] : -1;
+ color_neighbors[1] =
+ (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * cols + c - 1] : -1;
+ color_neighbors[2] = (r - 1 >= 0) ? color_map[(r - 1) * cols + c] : -1;
+ color_neighbors[3] = (r - 1 >= 0 && c + 1 <= cols - 1)
+ ? color_map[(r - 1) * cols + c + 1]
+ : -1;
+
for (i = 0; i < PALETTE_MAX_SIZE; ++i) {
color_order[i] = i;
inverse_color_order[i] = i;
}
memset(scores, 0, PALETTE_MAX_SIZE * sizeof(scores[0]));
for (i = 0; i < 4; ++i) {
- if (color_neighbors[i] >= 0) scores[color_neighbors[i]] += weights[i];
+ if (color_neighbors[i] >= 0) {
+ scores[color_neighbors[i]] += weights[i];
+ }
}
+
+ // Get the top 4 scores (sorted from large to small).
for (i = 0; i < 4; ++i) {
- max = scores[i];
- max_idx = i;
- j = i + 1;
- while (j < n) {
+ int max = scores[i];
+ int max_idx = i;
+ int j;
+ for (j = i + 1; j < n; ++j) {
if (scores[j] > max) {
max = scores[j];
max_idx = j;
}
- ++j;
}
+
if (max_idx != i) {
- temp = scores[i];
+ int temp = scores[i];
scores[i] = scores[max_idx];
scores[max_idx] = temp;
+
temp = color_order[i];
color_order[i] = color_order[max_idx];
color_order[max_idx] = temp;
@@ -771,13 +1026,20 @@
inverse_color_order[color_order[max_idx]] = max_idx;
}
}
- for (i = 0; i < 4; ++i) color_ctx = color_ctx * 11 + scores[i];
- for (i = 0; i < PALETTE_COLOR_CONTEXTS; ++i)
- if (color_ctx == palette_color_context_lookup[i]) {
+
+ // Get hash value of context.
+ color_ctx_hash = 0;
+ for (i = 0; i < 4; ++i) color_ctx_hash = color_ctx_hash * 11 + scores[i];
+
+  // Look up the context from the hash.
+ color_ctx = 0; // Default.
+ for (i = 0; i < PALETTE_COLOR_CONTEXTS; ++i) {
+ if (color_ctx_hash == palette_color_context_lookup[i]) {
color_ctx = i;
break;
}
- if (color_ctx >= PALETTE_COLOR_CONTEXTS) color_ctx = 0;
+ }
+
if (color_idx != NULL) {
*color_idx = inverse_color_order[color_map[r * cols + c]];
}
@@ -785,44 +1047,52 @@
}
#endif // CONFIG_PALETTE
-void av1_tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
- unsigned int (*ct_32x32p)[2]) {
- ct_32x32p[TX_4X4][0] = tx_count_32x32p[TX_4X4];
- ct_32x32p[TX_4X4][1] = tx_count_32x32p[TX_8X8] + tx_count_32x32p[TX_16X16] +
- tx_count_32x32p[TX_32X32];
- ct_32x32p[TX_8X8][0] = tx_count_32x32p[TX_8X8];
- ct_32x32p[TX_8X8][1] = tx_count_32x32p[TX_16X16] + tx_count_32x32p[TX_32X32];
- ct_32x32p[TX_16X16][0] = tx_count_32x32p[TX_16X16];
- ct_32x32p[TX_16X16][1] = tx_count_32x32p[TX_32X32];
-}
-
-void av1_tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
- unsigned int (*ct_16x16p)[2]) {
- ct_16x16p[TX_4X4][0] = tx_count_16x16p[TX_4X4];
- ct_16x16p[TX_4X4][1] = tx_count_16x16p[TX_8X8] + tx_count_16x16p[TX_16X16];
- ct_16x16p[TX_8X8][0] = tx_count_16x16p[TX_8X8];
- ct_16x16p[TX_8X8][1] = tx_count_16x16p[TX_16X16];
-}
-
-void av1_tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
- unsigned int (*ct_8x8p)[2]) {
- ct_8x8p[TX_4X4][0] = tx_count_8x8p[TX_4X4];
- ct_8x8p[TX_4X4][1] = tx_count_8x8p[TX_8X8];
-}
+#if CONFIG_VAR_TX
+static const aom_prob default_txfm_partition_probs[TXFM_PARTITION_CONTEXTS] = {
+ 250, 231, 212, 241, 166, 66, 241, 230, 135, 243, 154, 64, 248, 161, 63, 128,
+};
+#endif
static const aom_prob default_skip_probs[SKIP_CONTEXTS] = { 192, 128, 64 };
#if CONFIG_EXT_INTERP
-static const aom_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
- [SWITCHABLE_FILTERS - 1] = {
- { 235, 192, 128, 128 },
- { 36, 243, 208, 128 },
- { 34, 16, 128, 128 },
- { 36, 243, 48, 128 },
- { 34, 16, 128, 128 },
- { 149, 160, 128, 128 },
- };
-#else // CONFIG_EXT_INTERP
+static const aom_prob default_switchable_interp_prob
+ [SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS - 1] = {
+#if CONFIG_DUAL_FILTER
+ { 235, 192, 128, 128 }, { 36, 243, 208, 128 }, { 34, 16, 128, 128 },
+ { 36, 243, 48, 128 }, { 34, 16, 128, 128 }, { 149, 160, 128, 128 },
+
+ { 235, 192, 128, 128 }, { 36, 243, 208, 128 }, { 34, 16, 128, 128 },
+ { 36, 243, 48, 128 }, { 34, 16, 128, 128 }, { 149, 160, 128, 128 },
+
+ { 235, 192, 128, 128 }, { 36, 243, 208, 128 }, { 34, 16, 128, 128 },
+ { 36, 243, 48, 128 }, { 34, 16, 128, 128 }, { 149, 160, 128, 128 },
+
+ { 235, 192, 128, 128 }, { 36, 243, 208, 128 }, { 34, 16, 128, 128 },
+ { 36, 243, 48, 128 }, { 34, 16, 128, 128 }, { 149, 160, 128, 128 },
+#else
+ { 235, 192, 128, 128 }, { 36, 243, 208, 128 }, { 34, 16, 128, 128 },
+ { 36, 243, 48, 128 }, { 34, 16, 128, 128 }, { 149, 160, 128, 128 },
+#endif
+ };
+#else // CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+static const aom_prob
+ default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS - 1] = {
+ { 235, 162 }, { 36, 255 },
+ { 34, 3 }, { 149, 144 },
+
+ { 235, 162 }, { 36, 255 },
+ { 34, 3 }, { 10, 3 },
+
+ { 235, 162 }, { 36, 255 },
+ { 34, 3 }, { 149, 144 },
+
+ { 235, 162 }, { 36, 255 },
+ { 34, 3 }, { 10, 3 },
+ };
+#else
static const aom_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS - 1] = {
{ 235, 162 },
@@ -830,21 +1100,280 @@
{ 34, 3 },
{ 149, 144 },
};
+#endif
#endif // CONFIG_EXT_INTERP
-// FIXME(someone) need real defaults here
-static const aom_prob default_segment_tree_probs[SEG_TREE_PROBS] = {
- 128, 128, 128, 128, 128, 128, 128
+#if CONFIG_EXT_TX
+/* clang-format off */
+const aom_tree_index av1_ext_tx_inter_tree[EXT_TX_SETS_INTER]
+ [TREE_SIZE(TX_TYPES)] = {
+      { // ToDo(yaowu): remove unused entry 0.
+ 0
+ }, {
+ -IDTX, 2,
+ 4, 14,
+ 6, 8,
+ -V_DCT, -H_DCT,
+ 10, 12,
+ -V_ADST, -H_ADST,
+ -V_FLIPADST, -H_FLIPADST,
+ -DCT_DCT, 16,
+ 18, 24,
+ 20, 22,
+ -ADST_DCT, -DCT_ADST,
+ -FLIPADST_DCT, -DCT_FLIPADST,
+ 26, 28,
+ -ADST_ADST, -FLIPADST_FLIPADST,
+ -ADST_FLIPADST, -FLIPADST_ADST
+ }, {
+ -IDTX, 2,
+ 4, 6,
+ -V_DCT, -H_DCT,
+ -DCT_DCT, 8,
+ 10, 16,
+ 12, 14,
+ -ADST_DCT, -DCT_ADST,
+ -FLIPADST_DCT, -DCT_FLIPADST,
+ 18, 20,
+ -ADST_ADST, -FLIPADST_FLIPADST,
+ -ADST_FLIPADST, -FLIPADST_ADST
+ }, {
+ -IDTX, -DCT_DCT,
+ }
};
-// clang-format off
-static const aom_prob default_segment_pred_probs[PREDICTION_PROBS] = {
- 128, 128, 128
-};
-// clang-format on
-const aom_tree_index av1_ext_tx_tree[TREE_SIZE(TX_TYPES)] = {
- -DCT_DCT, 2, -ADST_ADST, 4, -ADST_DCT, -DCT_ADST
+const aom_tree_index av1_ext_tx_intra_tree[EXT_TX_SETS_INTRA]
+ [TREE_SIZE(TX_TYPES)] = {
+ { // ToDo(yaowu): remove unused entry 0.
+ 0
+ }, {
+ -IDTX, 2,
+ -DCT_DCT, 4,
+ 6, 8,
+ -V_DCT, -H_DCT,
+ -ADST_ADST, 10,
+ -ADST_DCT, -DCT_ADST,
+ }, {
+ -IDTX, 2,
+ -DCT_DCT, 4,
+ -ADST_ADST, 6,
+ -ADST_DCT, -DCT_ADST,
+ }
};
+/* clang-format on */
+
+static const aom_prob
+ default_inter_ext_tx_prob[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES - 1] = {
+ {
+ // ToDo(yaowu): remove unused entry 0.
+ { 0 },
+ { 0 },
+ { 0 },
+#if EXT_TX_SIZES == 4
+ { 0 },
+#endif
+ },
+ {
+ { 10, 24, 30, 128, 128, 128, 128, 112, 160, 128, 128, 128, 128, 128,
+ 128 },
+ { 10, 24, 30, 128, 128, 128, 128, 112, 160, 128, 128, 128, 128, 128,
+ 128 },
+ { 10, 24, 30, 128, 128, 128, 128, 112, 160, 128, 128, 128, 128, 128,
+ 128 },
+#if EXT_TX_SIZES == 4
+ { 10, 24, 30, 128, 128, 128, 128, 112, 160, 128, 128, 128, 128, 128,
+ 128 },
+#endif
+ },
+ {
+ { 10, 30, 128, 112, 160, 128, 128, 128, 128, 128, 128 },
+ { 10, 30, 128, 112, 160, 128, 128, 128, 128, 128, 128 },
+ { 10, 30, 128, 112, 160, 128, 128, 128, 128, 128, 128 },
+#if EXT_TX_SIZES == 4
+ { 10, 30, 128, 112, 160, 128, 128, 128, 128, 128, 128 },
+#endif
+ },
+ {
+ { 12 },
+ { 12 },
+ { 12 },
+#if EXT_TX_SIZES == 4
+ { 12 },
+#endif
+ }
+ };
+
+static const aom_prob
+ default_intra_ext_tx_prob[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES - 1] = {
+ {
+ // ToDo(yaowu): remove unused entry 0.
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+#if EXT_TX_SIZES == 4
+ },
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+#endif
+ },
+ },
+ {
+ {
+ { 8, 224, 32, 128, 64, 128 },
+ { 10, 32, 32, 128, 16, 192 },
+ { 10, 32, 32, 128, 16, 64 },
+ { 9, 200, 32, 128, 64, 128 },
+ { 8, 8, 32, 128, 224, 128 },
+ { 10, 32, 32, 128, 16, 192 },
+ { 10, 32, 32, 128, 16, 64 },
+ { 10, 23, 32, 128, 80, 176 },
+ { 10, 23, 32, 128, 80, 176 },
+ { 10, 32, 32, 128, 16, 64 },
+ },
+ {
+ { 8, 224, 32, 128, 64, 128 },
+ { 10, 32, 32, 128, 16, 192 },
+ { 10, 32, 32, 128, 16, 64 },
+ { 9, 200, 32, 128, 64, 128 },
+ { 8, 8, 32, 128, 224, 128 },
+ { 10, 32, 32, 128, 16, 192 },
+ { 10, 32, 32, 128, 16, 64 },
+ { 10, 23, 32, 128, 80, 176 },
+ { 10, 23, 32, 128, 80, 176 },
+ { 10, 32, 32, 128, 16, 64 },
+ },
+ {
+ { 8, 224, 32, 128, 64, 128 },
+ { 10, 32, 32, 128, 16, 192 },
+ { 10, 32, 32, 128, 16, 64 },
+ { 9, 200, 32, 128, 64, 128 },
+ { 8, 8, 32, 128, 224, 128 },
+ { 10, 32, 32, 128, 16, 192 },
+ { 10, 32, 32, 128, 16, 64 },
+ { 10, 23, 32, 128, 80, 176 },
+ { 10, 23, 32, 128, 80, 176 },
+ { 10, 32, 32, 128, 16, 64 },
+#if EXT_TX_SIZES == 4
+ },
+ {
+ { 8, 224, 32, 128, 64, 128 },
+ { 10, 32, 32, 128, 16, 192 },
+ { 10, 32, 32, 128, 16, 64 },
+ { 9, 200, 32, 128, 64, 128 },
+ { 8, 8, 32, 128, 224, 128 },
+ { 10, 32, 32, 128, 16, 192 },
+ { 10, 32, 32, 128, 16, 64 },
+ { 10, 23, 32, 128, 80, 176 },
+ { 10, 23, 32, 128, 80, 176 },
+ { 10, 32, 32, 128, 16, 64 },
+#endif
+ },
+ },
+ {
+ {
+ { 8, 224, 64, 128 },
+ { 10, 32, 16, 192 },
+ { 10, 32, 16, 64 },
+ { 9, 200, 64, 128 },
+ { 8, 8, 224, 128 },
+ { 10, 32, 16, 192 },
+ { 10, 32, 16, 64 },
+ { 10, 23, 80, 176 },
+ { 10, 23, 80, 176 },
+ { 10, 32, 16, 64 },
+ },
+ {
+ { 8, 224, 64, 128 },
+ { 10, 32, 16, 192 },
+ { 10, 32, 16, 64 },
+ { 9, 200, 64, 128 },
+ { 8, 8, 224, 128 },
+ { 10, 32, 16, 192 },
+ { 10, 32, 16, 64 },
+ { 10, 23, 80, 176 },
+ { 10, 23, 80, 176 },
+ { 10, 32, 16, 64 },
+ },
+ {
+ { 8, 224, 64, 128 },
+ { 10, 32, 16, 192 },
+ { 10, 32, 16, 64 },
+ { 9, 200, 64, 128 },
+ { 8, 8, 224, 128 },
+ { 10, 32, 16, 192 },
+ { 10, 32, 16, 64 },
+ { 10, 23, 80, 176 },
+ { 10, 23, 80, 176 },
+ { 10, 32, 16, 64 },
+#if EXT_TX_SIZES == 4
+ },
+ {
+ { 8, 224, 64, 128 },
+ { 10, 32, 16, 192 },
+ { 10, 32, 16, 64 },
+ { 9, 200, 64, 128 },
+ { 8, 8, 224, 128 },
+ { 10, 32, 16, 192 },
+ { 10, 32, 16, 64 },
+ { 10, 23, 80, 176 },
+ { 10, 23, 80, 176 },
+ { 10, 32, 16, 64 },
+#endif
+ },
+ },
+ };
+
+#else
+
+/* clang-format off */
+const aom_tree_index av1_ext_tx_tree[TREE_SIZE(TX_TYPES)] = {
+ -DCT_DCT, 2,
+ -ADST_ADST, 4,
+ -ADST_DCT, -DCT_ADST
+};
+/* clang-format on */
int av1_ext_tx_ind[TX_TYPES];
int av1_ext_tx_inv[TX_TYPES];
@@ -867,6 +1396,44 @@
{ 176, 85, 128 },
{ 192, 85, 128 },
};
+#endif // CONFIG_EXT_TX
+
+#if CONFIG_EXT_INTRA
+static const aom_prob default_intra_filter_probs[INTRA_FILTERS + 1]
+ [INTRA_FILTERS - 1] = {
+ { 98, 63, 60 },
+ { 98, 82, 80 },
+ { 94, 65, 103 },
+ { 49, 25, 24 },
+ { 72, 38, 50 },
+ };
+const aom_tree_index av1_intra_filter_tree[TREE_SIZE(INTRA_FILTERS)] = {
+ -INTRA_FILTER_LINEAR, 2, -INTRA_FILTER_8TAP, 4, -INTRA_FILTER_8TAP_SHARP,
+ -INTRA_FILTER_8TAP_SMOOTH,
+};
+#endif // CONFIG_EXT_INTRA
+
+#if CONFIG_FILTER_INTRA
+static const aom_prob default_filter_intra_probs[2] = { 230, 230 };
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_SUPERTX
+static const aom_prob default_supertx_prob[PARTITION_SUPERTX_CONTEXTS]
+ [TX_SIZES] = {
+ { 1, 160, 160, 170 },
+ { 1, 200, 200, 210 },
+ };
+#endif // CONFIG_SUPERTX
+
+// FIXME(someone) need real defaults here
+static const aom_prob default_segment_tree_probs[SEG_TREE_PROBS] = {
+ 128, 128, 128, 128, 128, 128, 128
+};
+// clang-format off
+static const aom_prob default_segment_pred_probs[PREDICTION_PROBS] = {
+ 128, 128, 128
+};
+// clang-format on
static void init_mode_probs(FRAME_CONTEXT *fc) {
av1_copy(fc->uv_mode_prob, default_uv_probs);
@@ -875,29 +1442,55 @@
av1_copy(fc->partition_prob, default_partition_probs);
av1_copy(fc->intra_inter_prob, default_intra_inter_p);
av1_copy(fc->comp_inter_prob, default_comp_inter_p);
-#if CONFIG_EXT_REFS
- av1_copy(fc->comp_fwdref_prob, default_comp_fwdref_p);
- av1_copy(fc->comp_bwdref_prob, default_comp_bwdref_p);
-#else
av1_copy(fc->comp_ref_prob, default_comp_ref_p);
+#if CONFIG_EXT_REFS
+ av1_copy(fc->comp_bwdref_prob, default_comp_bwdref_p);
#endif // CONFIG_EXT_REFS
av1_copy(fc->single_ref_prob, default_single_ref_p);
- fc->tx_probs = default_tx_probs;
+ av1_copy(fc->tx_size_probs, default_tx_size_prob);
+#if CONFIG_VAR_TX
+ av1_copy(fc->txfm_partition_prob, default_txfm_partition_probs);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ av1_copy(fc->rect_tx_prob, default_rect_tx_prob);
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+#endif
av1_copy(fc->skip_probs, default_skip_probs);
#if CONFIG_REF_MV
av1_copy(fc->newmv_prob, default_newmv_prob);
av1_copy(fc->zeromv_prob, default_zeromv_prob);
av1_copy(fc->refmv_prob, default_refmv_prob);
av1_copy(fc->drl_prob, default_drl_prob);
-#endif
+#if CONFIG_EXT_INTER
+ fc->new2mv_prob = default_new2mv_prob;
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_REF_MV
av1_copy(fc->inter_mode_probs, default_inter_mode_probs);
-#if CONFIG_MOTION_VAR
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
av1_copy(fc->motion_mode_prob, default_motion_mode_prob);
-#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_EXT_INTER
+ av1_copy(fc->inter_compound_mode_probs, default_inter_compound_mode_probs);
+ av1_copy(fc->interintra_prob, default_interintra_prob);
+ av1_copy(fc->interintra_mode_prob, default_interintra_mode_prob);
+ av1_copy(fc->wedge_interintra_prob, default_wedge_interintra_prob);
+ av1_copy(fc->wedge_interinter_prob, default_wedge_interinter_prob);
+#endif // CONFIG_EXT_INTER
+#if CONFIG_SUPERTX
+ av1_copy(fc->supertx_prob, default_supertx_prob);
+#endif // CONFIG_SUPERTX
av1_copy(fc->seg.tree_probs, default_segment_tree_probs);
av1_copy(fc->seg.pred_probs, default_segment_pred_probs);
- av1_copy(fc->intra_ext_tx_prob, default_intra_ext_tx_prob);
+#if CONFIG_EXT_INTRA
+ av1_copy(fc->intra_filter_probs, default_intra_filter_probs);
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ av1_copy(fc->filter_intra_probs, default_filter_intra_probs);
+#endif // CONFIG_FILTER_INTRA
av1_copy(fc->inter_ext_tx_prob, default_inter_ext_tx_prob);
+ av1_copy(fc->intra_ext_tx_prob, default_intra_ext_tx_prob);
+#if CONFIG_LOOP_RESTORATION
+ av1_copy(fc->switchable_restore_prob, default_switchable_restore_prob);
+#endif // CONFIG_LOOP_RESTORATION
#if CONFIG_DAALA_EC
av1_tree_to_cdf_1D(av1_intra_mode_tree, fc->y_mode_prob, fc->y_mode_cdf,
BLOCK_SIZE_GROUPS);
@@ -972,7 +1565,7 @@
#if CONFIG_EXT_INTERP
const aom_tree_index av1_switchable_interp_tree[TREE_SIZE(SWITCHABLE_FILTERS)] =
{
- -EIGHTTAP,
+ -EIGHTTAP_REGULAR,
2,
4,
6,
@@ -981,9 +1574,9 @@
-MULTITAP_SHARP,
-MULTITAP_SHARP2,
};
-#else // CONFIG_EXT_INTERP
+#else
const aom_tree_index av1_switchable_interp_tree[TREE_SIZE(SWITCHABLE_FILTERS)] =
- { -EIGHTTAP, 2, -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP };
+ { -EIGHTTAP_REGULAR, 2, -EIGHTTAP_SMOOTH, -MULTITAP_SHARP };
#endif // CONFIG_EXT_INTERP
void av1_adapt_inter_frame_probs(AV1_COMMON *cm) {
@@ -993,55 +1586,107 @@
const FRAME_COUNTS *counts = &cm->counts;
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
- fc->intra_inter_prob[i] = mode_mv_merge_probs(pre_fc->intra_inter_prob[i],
- counts->intra_inter[i]);
+ fc->intra_inter_prob[i] = av1_mode_mv_merge_probs(
+ pre_fc->intra_inter_prob[i], counts->intra_inter[i]);
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
- fc->comp_inter_prob[i] =
- mode_mv_merge_probs(pre_fc->comp_inter_prob[i], counts->comp_inter[i]);
+ fc->comp_inter_prob[i] = av1_mode_mv_merge_probs(pre_fc->comp_inter_prob[i],
+ counts->comp_inter[i]);
#if CONFIG_EXT_REFS
for (i = 0; i < REF_CONTEXTS; i++)
for (j = 0; j < (FWD_REFS - 1); j++)
- fc->comp_fwdref_prob[i][j] = mode_mv_merge_probs(
- pre_fc->comp_fwdref_prob[i][j], counts->comp_fwdref[i][j]);
+ fc->comp_ref_prob[i][j] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i][j],
+ counts->comp_ref[i][j]);
for (i = 0; i < REF_CONTEXTS; i++)
for (j = 0; j < (BWD_REFS - 1); j++)
fc->comp_bwdref_prob[i][j] = mode_mv_merge_probs(
pre_fc->comp_bwdref_prob[i][j], counts->comp_bwdref[i][j]);
#else
for (i = 0; i < REF_CONTEXTS; i++)
- fc->comp_ref_prob[i] =
- mode_mv_merge_probs(pre_fc->comp_ref_prob[i], counts->comp_ref[i]);
+ for (j = 0; j < (COMP_REFS - 1); j++)
+ fc->comp_ref_prob[i][j] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i][j],
+ counts->comp_ref[i][j]);
#endif // CONFIG_EXT_REFS
for (i = 0; i < REF_CONTEXTS; i++)
for (j = 0; j < (SINGLE_REFS - 1); j++)
- fc->single_ref_prob[i][j] = mode_mv_merge_probs(
+ fc->single_ref_prob[i][j] = av1_mode_mv_merge_probs(
pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]);
#if CONFIG_REF_MV
for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
fc->newmv_prob[i] =
- mode_mv_merge_probs(pre_fc->newmv_prob[i], counts->newmv_mode[i]);
+ av1_mode_mv_merge_probs(pre_fc->newmv_prob[i], counts->newmv_mode[i]);
for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
fc->zeromv_prob[i] =
- mode_mv_merge_probs(pre_fc->zeromv_prob[i], counts->zeromv_mode[i]);
+ av1_mode_mv_merge_probs(pre_fc->zeromv_prob[i], counts->zeromv_mode[i]);
for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
fc->refmv_prob[i] =
- mode_mv_merge_probs(pre_fc->refmv_prob[i], counts->refmv_mode[i]);
+ av1_mode_mv_merge_probs(pre_fc->refmv_prob[i], counts->refmv_mode[i]);
+
for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
fc->drl_prob[i] =
- mode_mv_merge_probs(pre_fc->drl_prob[i], counts->drl_mode[i]);
+ av1_mode_mv_merge_probs(pre_fc->drl_prob[i], counts->drl_mode[i]);
+#if CONFIG_EXT_INTER
+ fc->new2mv_prob =
+ av1_mode_mv_merge_probs(pre_fc->new2mv_prob, counts->new2mv_mode);
+#endif // CONFIG_EXT_INTER
#else
for (i = 0; i < INTER_MODE_CONTEXTS; i++)
aom_tree_merge_probs(av1_inter_mode_tree, pre_fc->inter_mode_probs[i],
counts->inter_mode[i], fc->inter_mode_probs[i]);
#endif
-#if CONFIG_MOTION_VAR
- for (i = 0; i < BLOCK_SIZES; i++)
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ for (i = BLOCK_8X8; i < BLOCK_SIZES; ++i)
aom_tree_merge_probs(av1_motion_mode_tree, pre_fc->motion_mode_prob[i],
counts->motion_mode[i], fc->motion_mode_prob[i]);
-#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_SUPERTX
+ for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
+ for (j = 1; j < TX_SIZES; ++j) {
+ fc->supertx_prob[i][j] = av1_mode_mv_merge_probs(
+ pre_fc->supertx_prob[i][j], counts->supertx[i][j]);
+ }
+ }
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_EXT_INTER
+ for (i = 0; i < INTER_MODE_CONTEXTS; i++)
+ aom_tree_merge_probs(
+ av1_inter_compound_mode_tree, pre_fc->inter_compound_mode_probs[i],
+ counts->inter_compound_mode[i], fc->inter_compound_mode_probs[i]);
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
+ if (is_interintra_allowed_bsize_group(i))
+ fc->interintra_prob[i] = av1_mode_mv_merge_probs(
+ pre_fc->interintra_prob[i], counts->interintra[i]);
+ }
+ for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+ aom_tree_merge_probs(
+ av1_interintra_mode_tree, pre_fc->interintra_mode_prob[i],
+ counts->interintra_mode[i], fc->interintra_mode_prob[i]);
+ }
+ for (i = 0; i < BLOCK_SIZES; ++i) {
+ if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i))
+ fc->wedge_interintra_prob[i] = av1_mode_mv_merge_probs(
+ pre_fc->wedge_interintra_prob[i], counts->wedge_interintra[i]);
+ }
+ for (i = 0; i < BLOCK_SIZES; ++i) {
+ if (is_interinter_wedge_used(i))
+ fc->wedge_interinter_prob[i] = av1_mode_mv_merge_probs(
+ pre_fc->wedge_interinter_prob[i], counts->wedge_interinter[i]);
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_VAR_TX && CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (cm->tx_mode == TX_MODE_SELECT) {
+ for (i = 0; i < MAX_TX_DEPTH; ++i) {
+ fc->rect_tx_prob[i] =
+ av1_mode_mv_merge_probs(pre_fc->rect_tx_prob[i], counts->rect_tx[i]);
+ }
+ }
+#endif // CONFIG_VAR_TX && CONFIG_EXT_TX && CONFIG_RECT_TX
for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
aom_tree_merge_probs(av1_intra_mode_tree, pre_fc->y_mode_prob[i],
@@ -1062,43 +1707,52 @@
}
void av1_adapt_intra_frame_probs(AV1_COMMON *cm) {
- int i;
+ int i, j;
FRAME_CONTEXT *fc = cm->fc;
const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
const FRAME_COUNTS *counts = &cm->counts;
if (cm->tx_mode == TX_MODE_SELECT) {
- int j;
- unsigned int branch_ct_8x8p[TX_SIZES - 3][2];
- unsigned int branch_ct_16x16p[TX_SIZES - 2][2];
- unsigned int branch_ct_32x32p[TX_SIZES - 1][2];
-
- for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
- av1_tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
- for (j = TX_4X4; j < TX_SIZES - 3; ++j)
- fc->tx_probs.p8x8[i][j] =
- mode_mv_merge_probs(pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]);
-
- av1_tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i],
- branch_ct_16x16p);
- for (j = TX_4X4; j < TX_SIZES - 2; ++j)
- fc->tx_probs.p16x16[i][j] = mode_mv_merge_probs(
- pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]);
-
- av1_tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i],
- branch_ct_32x32p);
- for (j = TX_4X4; j < TX_SIZES - 1; ++j)
- fc->tx_probs.p32x32[i][j] = mode_mv_merge_probs(
- pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]);
+ for (i = 0; i < MAX_TX_DEPTH; ++i) {
+ for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+ aom_tree_merge_probs(av1_tx_size_tree[i], pre_fc->tx_size_probs[i][j],
+ counts->tx_size[i][j], fc->tx_size_probs[i][j]);
}
}
+#if CONFIG_VAR_TX
+ if (cm->tx_mode == TX_MODE_SELECT) {
+ for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i)
+ fc->txfm_partition_prob[i] = av1_mode_mv_merge_probs(
+ pre_fc->txfm_partition_prob[i], counts->txfm_partition[i]);
+ }
+#endif
+
for (i = 0; i < SKIP_CONTEXTS; ++i)
fc->skip_probs[i] =
- mode_mv_merge_probs(pre_fc->skip_probs[i], counts->skip[i]);
+ av1_mode_mv_merge_probs(pre_fc->skip_probs[i], counts->skip[i]);
+#if CONFIG_EXT_TX
for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
- int j;
+ int s;
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+ if (use_inter_ext_tx_for_txsize[s][i]) {
+ aom_tree_merge_probs(
+ av1_ext_tx_inter_tree[s], pre_fc->inter_ext_tx_prob[s][i],
+ counts->inter_ext_tx[s][i], fc->inter_ext_tx_prob[s][i]);
+ }
+ }
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+ if (use_intra_ext_tx_for_txsize[s][i]) {
+ for (j = 0; j < INTRA_MODES; ++j)
+ aom_tree_merge_probs(
+ av1_ext_tx_intra_tree[s], pre_fc->intra_ext_tx_prob[s][i][j],
+ counts->intra_ext_tx[s][i][j], fc->intra_ext_tx_prob[s][i][j]);
+ }
+ }
+ }
+#else
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
for (j = 0; j < TX_TYPES; ++j) {
aom_tree_merge_probs(av1_ext_tx_tree, pre_fc->intra_ext_tx_prob[i][j],
counts->intra_ext_tx[i][j],
@@ -1109,11 +1763,12 @@
aom_tree_merge_probs(av1_ext_tx_tree, pre_fc->inter_ext_tx_prob[i],
counts->inter_ext_tx[i], fc->inter_ext_tx_prob[i]);
}
+#endif // CONFIG_EXT_TX
if (cm->seg.temporal_update) {
for (i = 0; i < PREDICTION_PROBS; i++)
- fc->seg.pred_probs[i] =
- mode_mv_merge_probs(pre_fc->seg.pred_probs[i], counts->seg.pred[i]);
+ fc->seg.pred_probs[i] = av1_mode_mv_merge_probs(pre_fc->seg.pred_probs[i],
+ counts->seg.pred[i]);
aom_tree_merge_probs(av1_segment_tree, pre_fc->seg.tree_probs,
counts->seg.tree_mispred, fc->seg.tree_probs);
@@ -1126,16 +1781,35 @@
aom_tree_merge_probs(av1_intra_mode_tree, pre_fc->uv_mode_prob[i],
counts->uv_mode[i], fc->uv_mode_prob[i]);
+#if CONFIG_EXT_PARTITION_TYPES
+ aom_tree_merge_probs(av1_partition_tree, pre_fc->partition_prob[0],
+ counts->partition[0], fc->partition_prob[0]);
+ for (i = 1; i < PARTITION_CONTEXTS; i++)
+ aom_tree_merge_probs(av1_ext_partition_tree, pre_fc->partition_prob[i],
+ counts->partition[i], fc->partition_prob[i]);
+#else
for (i = 0; i < PARTITION_CONTEXTS; i++) {
aom_tree_merge_probs(av1_partition_tree, pre_fc->partition_prob[i],
counts->partition[i], fc->partition_prob[i]);
}
-
+#endif // CONFIG_EXT_PARTITION_TYPES
#if CONFIG_DELTA_Q
for (i = 0; i < DELTA_Q_CONTEXTS; ++i)
fc->delta_q_prob[i] =
mode_mv_merge_probs(pre_fc->delta_q_prob[i], counts->delta_q[i]);
#endif
+#if CONFIG_EXT_INTRA
+ for (i = 0; i < INTRA_FILTERS + 1; ++i) {
+ aom_tree_merge_probs(av1_intra_filter_tree, pre_fc->intra_filter_probs[i],
+ counts->intra_filter[i], fc->intra_filter_probs[i]);
+ }
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ fc->filter_intra_probs[i] = av1_mode_mv_merge_probs(
+ pre_fc->filter_intra_probs[i], counts->filter_intra[i]);
+ }
+#endif // CONFIG_FILTER_INTRA
}
static void set_default_lf_deltas(struct loopfilter *lf) {
@@ -1145,13 +1819,11 @@
lf->ref_deltas[INTRA_FRAME] = 1;
lf->ref_deltas[LAST_FRAME] = 0;
#if CONFIG_EXT_REFS
- lf->ref_deltas[LAST2_FRAME] = 0;
- lf->ref_deltas[LAST3_FRAME] = 0;
+ lf->ref_deltas[LAST2_FRAME] = lf->ref_deltas[LAST_FRAME];
+ lf->ref_deltas[LAST3_FRAME] = lf->ref_deltas[LAST_FRAME];
+ lf->ref_deltas[BWDREF_FRAME] = lf->ref_deltas[LAST_FRAME];
#endif // CONFIG_EXT_REFS
lf->ref_deltas[GOLDEN_FRAME] = -1;
-#if CONFIG_EXT_REFS
- lf->ref_deltas[BWDREF_FRAME] = -1;
-#endif // CONFIG_EXT_REFS
lf->ref_deltas[ALTREF_FRAME] = -1;
lf->mode_deltas[0] = 0;
@@ -1180,6 +1852,14 @@
// To force update of the sharpness
lf->last_sharpness_level = -1;
+#if CONFIG_LOOP_RESTORATION
+ if (cm->rst_info.bilateral_info) {
+ int s;
+ for (i = 0; i < cm->rst_internal.ntiles; ++i)
+ for (s = 0; s < BILATERAL_SUBTILES; ++s)
+ cm->rst_info.bilateral_info[i].level[s] = -1;
+ }
+#endif // CONFIG_LOOP_RESTORATION
av1_default_coef_probs(cm);
init_mode_probs(cm->fc);
@@ -1187,7 +1867,6 @@
#if CONFIG_ADAPT_SCAN
av1_init_scan_order(cm);
#endif
-
cm->fc->initialized = 1;
if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode ||
diff --git a/av1/common/entropymode.h b/av1/common/entropymode.h
index 97d24a7..1d95cdc 100644
--- a/av1/common/entropymode.h
+++ b/av1/common/entropymode.h
@@ -27,11 +27,14 @@
#define TX_SIZE_CONTEXTS 2
#define INTER_OFFSET(mode) ((mode)-NEARESTMV)
+#if CONFIG_EXT_INTER
+#define INTER_COMPOUND_OFFSET(mode) ((mode)-NEAREST_NEARESTMV)
+#endif // CONFIG_EXT_INTER
#if CONFIG_PALETTE
#define PALETTE_COLOR_CONTEXTS 16
#define PALETTE_MAX_SIZE 8
-#define PALETTE_BLOCK_SIZES (BLOCK_64X64 - BLOCK_8X8 + 1)
+#define PALETTE_BLOCK_SIZES (BLOCK_LARGEST - BLOCK_8X8 + 1)
#define PALETTE_Y_MODE_CONTEXTS 3
#define PALETTE_MAX_BLOCK_SIZE (64 * 64)
#endif // CONFIG_PALETTE
@@ -44,19 +47,6 @@
const int16_t *neighbors;
} SCAN_ORDER;
-struct tx_probs {
- aom_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1];
- aom_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2];
- aom_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3];
-};
-
-struct tx_counts {
- unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES];
- unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 1];
- unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2];
- unsigned int tx_totals[TX_SIZES];
-};
-
struct seg_counts {
unsigned int tree_total[MAX_SEGMENTS];
unsigned int tree_mispred[MAX_SEGMENTS];
@@ -66,7 +56,11 @@
typedef struct frame_contexts {
aom_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
aom_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
+#if CONFIG_EXT_PARTITION_TYPES
+ aom_prob partition_prob[PARTITION_CONTEXTS][EXT_PARTITION_TYPES - 1];
+#else
aom_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
+#endif
av1_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES];
#if CONFIG_EC_MULTISYMBOL
coeff_cdf_model coef_cdfs[TX_SIZES][PLANE_TYPES];
@@ -97,39 +91,79 @@
int16_t nb_32X32[TX_TYPES][(1024 + 1) * 2];
SCAN_ORDER sc[TX_SIZES][TX_TYPES];
-#endif
+#endif // CONFIG_ADAPT_SCAN
#if CONFIG_REF_MV
aom_prob newmv_prob[NEWMV_MODE_CONTEXTS];
aom_prob zeromv_prob[ZEROMV_MODE_CONTEXTS];
aom_prob refmv_prob[REFMV_MODE_CONTEXTS];
aom_prob drl_prob[DRL_MODE_CONTEXTS];
-#endif
+
+#if CONFIG_EXT_INTER
+ aom_prob new2mv_prob;
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_REF_MV
aom_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
-#if CONFIG_MOTION_VAR
+#if CONFIG_EXT_INTER
+ aom_prob inter_compound_mode_probs[INTER_MODE_CONTEXTS]
+ [INTER_COMPOUND_MODES - 1];
+ aom_prob interintra_prob[BLOCK_SIZE_GROUPS];
+ aom_prob interintra_mode_prob[BLOCK_SIZE_GROUPS][INTERINTRA_MODES - 1];
+ aom_prob wedge_interintra_prob[BLOCK_SIZES];
+ aom_prob wedge_interinter_prob[BLOCK_SIZES];
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
aom_prob motion_mode_prob[BLOCK_SIZES][MOTION_MODES - 1];
-#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
aom_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
aom_prob comp_inter_prob[COMP_INTER_CONTEXTS];
aom_prob single_ref_prob[REF_CONTEXTS][SINGLE_REFS - 1];
#if CONFIG_EXT_REFS
- aom_prob comp_fwdref_prob[REF_CONTEXTS][FWD_REFS - 1];
+ aom_prob comp_ref_prob[REF_CONTEXTS][FWD_REFS - 1];
aom_prob comp_bwdref_prob[REF_CONTEXTS][BWD_REFS - 1];
#else
- aom_prob comp_ref_prob[REF_CONTEXTS];
+ aom_prob comp_ref_prob[REF_CONTEXTS][COMP_REFS - 1];
#endif // CONFIG_EXT_REFS
- struct tx_probs tx_probs;
+ aom_prob tx_size_probs[MAX_TX_DEPTH][TX_SIZE_CONTEXTS][MAX_TX_DEPTH];
+#if CONFIG_VAR_TX
+ aom_prob txfm_partition_prob[TXFM_PARTITION_CONTEXTS];
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+  // TODO(yuec): Make this flag harmonize with the original syntax.
+ aom_prob rect_tx_prob[TX_SIZES - 1];
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+#endif
aom_prob skip_probs[SKIP_CONTEXTS];
#if CONFIG_REF_MV
nmv_context nmvc[NMV_CONTEXTS];
#else
nmv_context nmvc;
#endif
- struct segmentation_probs seg;
+ int initialized;
+#if CONFIG_EXT_TX
+ aom_prob inter_ext_tx_prob[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES - 1];
+ aom_prob intra_ext_tx_prob[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES - 1];
+#else
aom_prob intra_ext_tx_prob[EXT_TX_SIZES][TX_TYPES][TX_TYPES - 1];
aom_prob inter_ext_tx_prob[EXT_TX_SIZES][TX_TYPES - 1];
- int initialized;
+#endif // CONFIG_EXT_TX
+#if CONFIG_SUPERTX
+ aom_prob supertx_prob[PARTITION_SUPERTX_CONTEXTS][TX_SIZES];
+#endif // CONFIG_SUPERTX
+ struct segmentation_probs seg;
+#if CONFIG_EXT_INTRA
+ aom_prob intra_filter_probs[INTRA_FILTERS + 1][INTRA_FILTERS - 1];
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ aom_prob filter_intra_probs[PLANE_TYPES];
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_GLOBAL_MOTION
+ aom_prob global_motion_types_prob[GLOBAL_MOTION_TYPES - 1];
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_LOOP_RESTORATION
+ aom_prob switchable_restore_prob[RESTORE_SWITCHABLE_TYPES - 1];
+#endif // CONFIG_LOOP_RESTORATION
#if CONFIG_DAALA_EC
aom_cdf_prob y_mode_cdf[BLOCK_SIZE_GROUPS][INTRA_MODES];
aom_cdf_prob uv_mode_cdf[INTRA_MODES][INTRA_MODES];
@@ -146,16 +180,21 @@
} FRAME_CONTEXT;
typedef struct FRAME_COUNTS {
+ // Note: This structure should only contain 'unsigned int' fields, or
+ // aggregates built solely from 'unsigned int' fields/elements
unsigned int kf_y_mode[INTRA_MODES][INTRA_MODES][INTRA_MODES];
unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
+#if CONFIG_EXT_PARTITION_TYPES
+ unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+#else
unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES];
+#endif
av1_coeff_count_model coef[TX_SIZES][PLANE_TYPES];
unsigned int eob_branch[TX_SIZES][PLANE_TYPES][REF_TYPES][COEF_BANDS]
[COEFF_CONTEXTS];
unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS];
-
#if CONFIG_ADAPT_SCAN
unsigned int non_zero_count_4X4[TX_TYPES][16];
unsigned int non_zero_count_8X8[TX_TYPES][64];
@@ -169,34 +208,73 @@
unsigned int zeromv_mode[ZEROMV_MODE_CONTEXTS][2];
unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2];
unsigned int drl_mode[DRL_MODE_CONTEXTS][2];
+#if CONFIG_EXT_INTER
+ unsigned int new2mv_mode[2];
+#endif // CONFIG_EXT_INTER
#endif
unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
-#if CONFIG_MOTION_VAR
+#if CONFIG_EXT_INTER
+ unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+ unsigned int interintra[BLOCK_SIZE_GROUPS][2];
+ unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+ unsigned int wedge_interintra[BLOCK_SIZES][2];
+ unsigned int wedge_interinter[BLOCK_SIZES][2];
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
unsigned int motion_mode[BLOCK_SIZES][MOTION_MODES];
-#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS - 1][2];
#if CONFIG_EXT_REFS
- unsigned int comp_fwdref[REF_CONTEXTS][FWD_REFS - 1][2];
+ unsigned int comp_ref[REF_CONTEXTS][FWD_REFS - 1][2];
unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS - 1][2];
#else
- unsigned int comp_ref[REF_CONTEXTS][2];
+ unsigned int comp_ref[REF_CONTEXTS][COMP_REFS - 1][2];
#endif // CONFIG_EXT_REFS
- struct tx_counts tx;
+  // TODO(any): tx_size_totals is only used by the encoder to decide whether
+  // to use forward updates for the coeff probs, and as such it does not
+  // really belong in this structure.
+ unsigned int tx_size_totals[TX_SIZES];
+ unsigned int tx_size[MAX_TX_DEPTH][TX_SIZE_CONTEXTS][TX_SIZES];
+#if CONFIG_VAR_TX
+ unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ unsigned int rect_tx[TX_SIZES - 1][2];
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+#endif
unsigned int skip[SKIP_CONTEXTS][2];
#if CONFIG_REF_MV
nmv_context_counts mv[NMV_CONTEXTS];
#else
nmv_context_counts mv;
#endif
- struct seg_counts seg;
#if CONFIG_DELTA_Q
unsigned int delta_q[DELTA_Q_CONTEXTS][2];
#endif
+#if CONFIG_EXT_TX
+#if CONFIG_RECT_TX
+ unsigned int tx_size_implied[TX_SIZES][TX_SIZES];
+#endif // CONFIG_RECT_TX
+ unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+ unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES];
+#else
unsigned int intra_ext_tx[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
unsigned int inter_ext_tx[EXT_TX_SIZES][TX_TYPES];
+#endif // CONFIG_EXT_TX
+#if CONFIG_SUPERTX
+ unsigned int supertx[PARTITION_SUPERTX_CONTEXTS][TX_SIZES][2];
+ unsigned int supertx_size[TX_SIZES];
+#endif // CONFIG_SUPERTX
+ struct seg_counts seg;
+#if CONFIG_EXT_INTRA
+ unsigned int intra_filter[INTRA_FILTERS + 1][INTRA_FILTERS];
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ unsigned int filter_intra[PLANE_TYPES][2];
+#endif // CONFIG_FILTER_INTRA
} FRAME_COUNTS;
extern const aom_prob av1_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
@@ -204,7 +282,6 @@
#if CONFIG_DAALA_EC
extern aom_cdf_prob av1_kf_y_mode_cdf[INTRA_MODES][INTRA_MODES][INTRA_MODES];
#endif
-
#if CONFIG_PALETTE
extern const aom_prob av1_default_palette_y_mode_prob[PALETTE_BLOCK_SIZES]
[PALETTE_Y_MODE_CONTEXTS];
@@ -229,19 +306,46 @@
extern int av1_inter_mode_ind[INTER_MODES];
extern int av1_inter_mode_inv[INTER_MODES];
#endif
-#if CONFIG_MOTION_VAR
-extern const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)];
-#endif // CONFIG_MOTION_VAR
+#if CONFIG_EXT_INTER
+extern const aom_tree_index
+ av1_interintra_mode_tree[TREE_SIZE(INTERINTRA_MODES)];
+extern const aom_tree_index
+ av1_inter_compound_mode_tree[TREE_SIZE(INTER_COMPOUND_MODES)];
+#endif // CONFIG_EXT_INTER
extern const aom_tree_index av1_partition_tree[TREE_SIZE(PARTITION_TYPES)];
+#if CONFIG_EXT_PARTITION_TYPES
+extern const aom_tree_index
+ av1_ext_partition_tree[TREE_SIZE(EXT_PARTITION_TYPES)];
+#endif
extern const aom_tree_index
av1_switchable_interp_tree[TREE_SIZE(SWITCHABLE_FILTERS)];
-
#if CONFIG_PALETTE
extern const aom_tree_index av1_palette_size_tree[TREE_SIZE(PALETTE_SIZES)];
extern const aom_tree_index av1_palette_color_tree[PALETTE_MAX_SIZE - 1]
[TREE_SIZE(PALETTE_COLORS)];
#endif // CONFIG_PALETTE
+extern const aom_tree_index av1_tx_size_tree[MAX_TX_DEPTH][TREE_SIZE(TX_SIZES)];
+#if CONFIG_EXT_INTRA
+extern const aom_tree_index av1_intra_filter_tree[TREE_SIZE(INTRA_FILTERS)];
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_EXT_TX
+extern const aom_tree_index av1_ext_tx_inter_tree[EXT_TX_SETS_INTER]
+ [TREE_SIZE(TX_TYPES)];
+extern const aom_tree_index av1_ext_tx_intra_tree[EXT_TX_SETS_INTRA]
+ [TREE_SIZE(TX_TYPES)];
+#else
+extern const aom_tree_index av1_ext_tx_tree[TREE_SIZE(TX_TYPES)];
+#endif // CONFIG_EXT_TX
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+extern const aom_tree_index av1_motion_mode_tree[TREE_SIZE(MOTION_MODES)];
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_LOOP_RESTORATION
+#define RESTORE_NONE_BILATERAL_PROB 16
+#define RESTORE_NONE_WIENER_PROB 64
+extern const aom_tree_index
+ av1_switchable_restore_tree[TREE_SIZE(RESTORE_SWITCHABLE_TYPES)];
+#endif // CONFIG_LOOP_RESTORATION
#if CONFIG_DAALA_EC
extern int av1_switchable_interp_ind[SWITCHABLE_FILTERS];
extern int av1_switchable_interp_inv[SWITCHABLE_FILTERS];
@@ -253,15 +357,6 @@
void av1_adapt_intra_frame_probs(struct AV1Common *cm);
void av1_adapt_inter_frame_probs(struct AV1Common *cm);
-
-void av1_tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
- unsigned int (*ct_32x32p)[2]);
-void av1_tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
- unsigned int (*ct_16x16p)[2]);
-void av1_tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
- unsigned int (*ct_8x8p)[2]);
-
-extern const aom_tree_index av1_ext_tx_tree[TREE_SIZE(TX_TYPES)];
#if CONFIG_DAALA_EC
extern int av1_ext_tx_ind[TX_TYPES];
extern int av1_ext_tx_inv[TX_TYPES];
diff --git a/av1/common/entropymv.c b/av1/common/entropymv.c
index 43aed89..029f9f6 100644
--- a/av1/common/entropymv.c
+++ b/av1/common/entropymv.c
@@ -127,6 +127,16 @@
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10
};
+#if CONFIG_GLOBAL_MOTION
+const aom_tree_index
+ av1_global_motion_types_tree[TREE_SIZE(GLOBAL_MOTION_TYPES)] = {
+ -GLOBAL_ZERO, 2, -GLOBAL_TRANSLATION, 4, -GLOBAL_ROTZOOM, -GLOBAL_AFFINE
+ };
+
+static const aom_prob default_global_motion_types_prob[GLOBAL_MOTION_TYPES -
+ 1] = { 224, 128, 128 };
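+// Read with the tree above: the root probability 224 makes GLOBAL_ZERO the
+// likely outcome (224/256); the two remaining nodes split TRANSLATION vs.
+// {ROTZOOM, AFFINE} and ROTZOOM vs. AFFINE evenly (128/256 each).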
+#endif // CONFIG_GLOBAL_MOTION
+
static INLINE int mv_class_base(MV_CLASS_TYPE c) {
return c ? CLASS0_SIZE << (c + 2) : 0;
}
@@ -157,13 +167,13 @@
if (c == MV_CLASS_0) {
comp_counts->class0[d] += incr;
comp_counts->class0_fp[d][f] += incr;
- comp_counts->class0_hp[e] += usehp * incr;
+ if (usehp) comp_counts->class0_hp[e] += incr;
} else {
int i;
int b = c + CLASS0_BITS - 1; // number of bits
for (i = 0; i < b; ++i) comp_counts->bits[i][((d >> i) & 1)] += incr;
comp_counts->fp[f] += incr;
- comp_counts->hp[e] += usehp * incr;
+ if (usehp) comp_counts->hp[e] += incr;
}
}
@@ -192,20 +202,19 @@
aom_tree_merge_probs(av1_mv_joint_tree, pre_fc->joints, counts->joints,
fc->joints);
-
for (i = 0; i < 2; ++i) {
nmv_component *comp = &fc->comps[i];
const nmv_component *pre_comp = &pre_fc->comps[i];
const nmv_component_counts *c = &counts->comps[i];
- comp->sign = mode_mv_merge_probs(pre_comp->sign, c->sign);
+ comp->sign = av1_mode_mv_merge_probs(pre_comp->sign, c->sign);
aom_tree_merge_probs(av1_mv_class_tree, pre_comp->classes, c->classes,
comp->classes);
aom_tree_merge_probs(av1_mv_class0_tree, pre_comp->class0, c->class0,
comp->class0);
for (j = 0; j < MV_OFFSET_BITS; ++j)
- comp->bits[j] = mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]);
+ comp->bits[j] = av1_mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]);
for (j = 0; j < CLASS0_SIZE; ++j)
aom_tree_merge_probs(av1_mv_fp_tree, pre_comp->class0_fp[j],
@@ -215,8 +224,8 @@
if (allow_hp) {
comp->class0_hp =
- mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp);
- comp->hp = mode_mv_merge_probs(pre_comp->hp, c->hp);
+ av1_mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp);
+ comp->hp = av1_mode_mv_merge_probs(pre_comp->hp, c->hp);
}
}
}
@@ -233,14 +242,14 @@
const nmv_component *pre_comp = &pre_fc->comps[i];
const nmv_component_counts *c = &counts->comps[i];
- comp->sign = mode_mv_merge_probs(pre_comp->sign, c->sign);
+ comp->sign = av1_mode_mv_merge_probs(pre_comp->sign, c->sign);
aom_tree_merge_probs(av1_mv_class_tree, pre_comp->classes, c->classes,
comp->classes);
aom_tree_merge_probs(av1_mv_class0_tree, pre_comp->class0, c->class0,
comp->class0);
for (j = 0; j < MV_OFFSET_BITS; ++j)
- comp->bits[j] = mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]);
+ comp->bits[j] = av1_mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]);
for (j = 0; j < CLASS0_SIZE; ++j)
aom_tree_merge_probs(av1_mv_fp_tree, pre_comp->class0_fp[j],
@@ -249,8 +258,9 @@
aom_tree_merge_probs(av1_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
if (allow_hp) {
- comp->class0_hp = mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp);
- comp->hp = mode_mv_merge_probs(pre_comp->hp, c->hp);
+ comp->class0_hp =
+ av1_mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp);
+ comp->hp = av1_mode_mv_merge_probs(pre_comp->hp, c->hp);
}
}
#endif
@@ -285,4 +295,7 @@
av1_set_mv_cdfs(&cm->fc->nmvc);
#endif
#endif
+#if CONFIG_GLOBAL_MOTION
+ av1_copy(cm->fc->global_motion_types_prob, default_global_motion_types_prob);
+#endif // CONFIG_GLOBAL_MOTION
}
diff --git a/av1/common/entropymv.h b/av1/common/entropymv.h
index ca9de79..1ebbdb2 100644
--- a/av1/common/entropymv.h
+++ b/av1/common/entropymv.h
@@ -134,7 +134,10 @@
} nmv_context_counts;
void av1_inc_mv(const MV *mv, nmv_context_counts *mvctx, const int usehp);
-
+#if CONFIG_GLOBAL_MOTION
+extern const aom_tree_index
+ av1_global_motion_types_tree[TREE_SIZE(GLOBAL_MOTION_TYPES)];
+#endif // CONFIG_GLOBAL_MOTION
#if CONFIG_EC_MULTISYMBOL
void av1_set_mv_cdfs(nmv_context *ctx);
#endif
diff --git a/av1/common/enums.h b/av1/common/enums.h
index 5cf898c..6c0eb3d 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -13,6 +13,7 @@
#define AV1_COMMON_ENUMS_H_
#include "./aom_config.h"
+#include "aom/aom_codec.h"
#include "aom/aom_integer.h"
#ifdef __cplusplus
@@ -21,11 +22,18 @@
#undef MAX_SB_SIZE
-// Pixels per max superblock size
+// Max superblock size
+#if CONFIG_EXT_PARTITION
+#define MAX_SB_SIZE_LOG2 7
+#else
#define MAX_SB_SIZE_LOG2 6
+#endif // CONFIG_EXT_PARTITION
#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
+// Min superblock size
+#define MIN_SB_SIZE_LOG2 6
+
// Pixels per Mode Info (MI) unit
#define MI_SIZE_LOG2 3
#define MI_SIZE (1 << MI_SIZE_LOG2)
@@ -34,8 +42,25 @@
#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
+// MI-units per min superblock
+#define MIN_MIB_SIZE_LOG2 (MIN_SB_SIZE_LOG2 - MI_SIZE_LOG2)
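+// E.g. with CONFIG_EXT_PARTITION, MAX_SB_SIZE = 1 << 7 = 128 pixels, so
+// MAX_MIB_SIZE = 1 << (7 - 3) = 16 MI units per superblock side, while
+// MIN_MIB_SIZE_LOG2 = 6 - 3 = 3 (8 MI units for a 64x64 superblock).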
+
// Mask to extract MI offset within max MIB
#define MAX_MIB_MASK (MAX_MIB_SIZE - 1)
+#define MAX_MIB_MASK_2 (MAX_MIB_SIZE * 2 - 1)
+
+// Maximum number of tile rows and tile columns
+#if CONFIG_EXT_TILE
+#define MAX_TILE_ROWS 1024
+#define MAX_TILE_COLS 1024
+#else
+#define MAX_TILE_ROWS 4
+#define MAX_TILE_COLS 64
+#endif // CONFIG_EXT_TILE
+
+#if CONFIG_VAR_TX
+#define MAX_VARTX_DEPTH 2
+#endif
// Bitstream profiles indicated by 2-3 bits in the uncompressed header.
// 00: Profile 0. 8-bit 4:2:0 only.
@@ -52,52 +77,99 @@
MAX_PROFILES
} BITSTREAM_PROFILE;
-#define BLOCK_4X4 0
-#define BLOCK_4X8 1
-#define BLOCK_8X4 2
-#define BLOCK_8X8 3
-#define BLOCK_8X16 4
-#define BLOCK_16X8 5
-#define BLOCK_16X16 6
-#define BLOCK_16X32 7
-#define BLOCK_32X16 8
-#define BLOCK_32X32 9
-#define BLOCK_32X64 10
-#define BLOCK_64X32 11
-#define BLOCK_64X64 12
-#define BLOCK_SIZES 13
-#define BLOCK_INVALID BLOCK_SIZES
-typedef uint8_t BLOCK_SIZE;
+// Note: Some enums use the attribute 'packed' to use the smallest possible
+// integer type, so that we can save memory when they are used in
+// structs/arrays.
-typedef enum PARTITION_TYPE {
+typedef enum ATTRIBUTE_PACKED {
+ BLOCK_4X4,
+ BLOCK_4X8,
+ BLOCK_8X4,
+ BLOCK_8X8,
+ BLOCK_8X16,
+ BLOCK_16X8,
+ BLOCK_16X16,
+ BLOCK_16X32,
+ BLOCK_32X16,
+ BLOCK_32X32,
+ BLOCK_32X64,
+ BLOCK_64X32,
+ BLOCK_64X64,
+#if CONFIG_EXT_PARTITION
+ BLOCK_64X128,
+ BLOCK_128X64,
+ BLOCK_128X128,
+#endif // CONFIG_EXT_PARTITION
+
+ BLOCK_SIZES,
+ BLOCK_INVALID = BLOCK_SIZES,
+ BLOCK_LARGEST = (BLOCK_SIZES - 1)
+} BLOCK_SIZE;
+
+typedef enum {
PARTITION_NONE,
PARTITION_HORZ,
PARTITION_VERT,
PARTITION_SPLIT,
- PARTITION_TYPES,
- PARTITION_INVALID = PARTITION_TYPES
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_HORZ_A, // HORZ split and the top partition is split again
+ PARTITION_HORZ_B, // HORZ split and the bottom partition is split again
+ PARTITION_VERT_A, // VERT split and the left partition is split again
+ PARTITION_VERT_B, // VERT split and the right partition is split again
+ EXT_PARTITION_TYPES,
+#endif // CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPES = PARTITION_SPLIT + 1,
+ PARTITION_INVALID = 255
} PARTITION_TYPE;
typedef char PARTITION_CONTEXT;
#define PARTITION_PLOFFSET 4 // number of probability models per block size
+#if CONFIG_EXT_PARTITION
+#define PARTITION_CONTEXTS (5 * PARTITION_PLOFFSET)
+#else
#define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
+#endif // CONFIG_EXT_PARTITION
// block transform size
-typedef uint8_t TX_SIZE;
+typedef enum ATTRIBUTE_PACKED {
#if CONFIG_CB4X4
-#define TX_2X2 ((TX_SIZE)0) // 2x2 transform
-#define TX_4X4 ((TX_SIZE)1) // 4x4 transform
-#define TX_8X8 ((TX_SIZE)2) // 8x8 transform
-#define TX_16X16 ((TX_SIZE)3) // 16x16 transform
-#define TX_32X32 ((TX_SIZE)4) // 32x32 transform
-#define TX_SIZES ((TX_SIZE)5)
-#else
-#define TX_4X4 ((TX_SIZE)0) // 4x4 transform
-#define TX_8X8 ((TX_SIZE)1) // 8x8 transform
-#define TX_16X16 ((TX_SIZE)2) // 16x16 transform
-#define TX_32X32 ((TX_SIZE)3) // 32x32 transform
-#define TX_SIZES ((TX_SIZE)4)
+ TX_2X2, // 2x2 transform
#endif
+ TX_4X4, // 4x4 transform
+ TX_8X8, // 8x8 transform
+ TX_16X16, // 16x16 transform
+ TX_32X32, // 32x32 transform
+#if CONFIG_TX64X64
+ TX_64X64, // 64x64 transform
+#endif // CONFIG_TX64X64
+ TX_4X8, // 4x8 transform
+ TX_8X4, // 8x4 transform
+ TX_8X16, // 8x16 transform
+ TX_16X8, // 16x8 transform
+ TX_16X32, // 16x32 transform
+ TX_32X16, // 32x16 transform
+#if 0 // CONFIG_TX64X64
+ // TODO(debargha): To be enabled later
+ TX_32X64, // 32x64 transform
+ TX_64X32, // 64x32 transform
+#endif // CONFIG_TX64X64
+ TX_SIZES_ALL, // Includes rectangular transforms
+ TX_SIZES = TX_4X8, // Does NOT include rectangular transforms
+ TX_INVALID = 255 // Invalid transform size
+} TX_SIZE;
+
+#define MAX_TX_DEPTH (TX_32X32 - TX_4X4)
+
+#define MAX_TX_SIZE_LOG2 (5 + CONFIG_TX64X64)
+#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
+#define MIN_TX_SIZE_LOG2 2
+#define MIN_TX_SIZE (1 << MIN_TX_SIZE_LOG2)
+#define MAX_TX_SQUARE (MAX_TX_SIZE * MAX_TX_SIZE)
+
+// Number of maximum size transform blocks in the maximum size superblock
+#define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2)
+#define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2)
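+// E.g. without CONFIG_TX64X64, MAX_TX_SIZE = 1 << 5 = 32; combined with
+// CONFIG_EXT_PARTITION (MAX_SB_SIZE_LOG2 = 7) this gives
+// MAX_TX_BLOCKS_IN_MAX_SB = 1 << ((7 - 5) * 2) = 16 blocks per superblock.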
+
+#define MAX_NUM_TXB (1 << (MAX_SB_SIZE_LOG2 - MIN_TX_SIZE_LOG2))
// frame transform mode
typedef enum {
@@ -109,6 +181,15 @@
TX_MODES = 5,
} TX_MODE;
+// 1D tx types
+typedef enum {
+ DCT_1D = 0,
+ ADST_1D = 1,
+ FLIPADST_1D = 2,
+ IDTX_1D = 3,
+ TX_TYPES_1D = 4,
+} TX_TYPE_1D;
+
typedef enum {
DCT_DCT = 0, // DCT in both horizontal and vertical
ADST_DCT = 1, // ADST in vertical, DCT in horizontal
@@ -128,15 +209,20 @@
V_FLIPADST = 14,
H_FLIPADST = 15,
#endif // CONFIG_EXT_TX
- // TODO(sarahparker) this is temporary until EXT_TX is fully implemented
- TX_TYPES = 4,
+ TX_TYPES,
} TX_TYPE;
+#if CONFIG_EXT_TX
+#define EXT_TX_SIZES 4 // number of sizes that use extended transforms
+#define EXT_TX_SETS_INTER 4 // Sets of transform selections for INTER
+#define EXT_TX_SETS_INTRA 3 // Sets of transform selections for INTRA
+#else
#if CONFIG_CB4X4
#define EXT_TX_SIZES 4 // number of sizes that use extended transforms
#else
#define EXT_TX_SIZES 3 // number of sizes that use extended transforms
#endif
+#endif // CONFIG_EXT_TX
typedef enum {
AOM_LAST_FLAG = 1 << 0,
@@ -191,39 +277,95 @@
} CLPF_BLOCK_SIZE;
#endif
-#define DC_PRED 0 // Average of above and left pixels
-#define V_PRED 1 // Vertical
-#define H_PRED 2 // Horizontal
-#define D45_PRED 3 // Directional 45 deg = round(arctan(1/1) * 180/pi)
-#define D135_PRED 4 // Directional 135 deg = 180 - 45
-#define D117_PRED 5 // Directional 117 deg = 180 - 63
-#define D153_PRED 6 // Directional 153 deg = 180 - 27
-#define D207_PRED 7 // Directional 207 deg = 180 + 27
-#define D63_PRED 8 // Directional 63 deg = round(arctan(2/1) * 180/pi)
-#define TM_PRED 9 // True-motion
-#define NEARESTMV 10
-#define NEARMV 11
-#define ZEROMV 12
-#define NEWMV 13
-#define MB_MODE_COUNT 14
-typedef uint8_t PREDICTION_MODE;
+typedef enum ATTRIBUTE_PACKED {
+ DC_PRED, // Average of above and left pixels
+ V_PRED, // Vertical
+ H_PRED, // Horizontal
+ D45_PRED, // Directional 45 deg = round(arctan(1/1) * 180/pi)
+ D135_PRED, // Directional 135 deg = 180 - 45
+ D117_PRED, // Directional 117 deg = 180 - 63
+ D153_PRED, // Directional 153 deg = 180 - 27
+ D207_PRED, // Directional 207 deg = 180 + 27
+ D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi)
+ TM_PRED, // True-motion
+ NEARESTMV,
+ NEARMV,
+ ZEROMV,
+ NEWMV,
+#if CONFIG_EXT_INTER
+ NEWFROMNEARMV,
+ NEAREST_NEARESTMV,
+ NEAREST_NEARMV,
+ NEAR_NEARESTMV,
+ NEAR_NEARMV,
+ NEAREST_NEWMV,
+ NEW_NEARESTMV,
+ NEAR_NEWMV,
+ NEW_NEARMV,
+ ZERO_ZEROMV,
+ NEW_NEWMV,
+#endif // CONFIG_EXT_INTER
+ MB_MODE_COUNT,
+ INTRA_MODES = TM_PRED + 1
+} PREDICTION_MODE;
-#define INTRA_MODES (TM_PRED + 1)
+typedef enum {
+ SIMPLE_TRANSLATION = 0,
+#if CONFIG_MOTION_VAR
+ OBMC_CAUSAL, // 2-sided OBMC
+#endif // CONFIG_MOTION_VAR
+#if CONFIG_WARPED_MOTION
+ WARPED_CAUSAL, // 2-sided WARPED
+#endif // CONFIG_WARPED_MOTION
+ MOTION_MODES
+} MOTION_MODE;
-#define INTER_MODES (1 + NEWMV - NEARESTMV)
+#if CONFIG_EXT_INTER
+typedef enum {
+ II_DC_PRED = 0,
+ II_V_PRED,
+ II_H_PRED,
+ II_D45_PRED,
+ II_D135_PRED,
+ II_D117_PRED,
+ II_D153_PRED,
+ II_D207_PRED,
+ II_D63_PRED,
+ II_TM_PRED,
+ INTERINTRA_MODES
+} INTERINTRA_MODE;
+
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_FILTER_INTRA
+typedef enum {
+ FILTER_DC_PRED,
+ FILTER_V_PRED,
+ FILTER_H_PRED,
+ FILTER_D45_PRED,
+ FILTER_D135_PRED,
+ FILTER_D117_PRED,
+ FILTER_D153_PRED,
+ FILTER_D207_PRED,
+ FILTER_D63_PRED,
+ FILTER_TM_PRED,
+ FILTER_INTRA_MODES,
+} FILTER_INTRA_MODE;
+#endif // CONFIG_FILTER_INTRA
#if CONFIG_EXT_INTRA
-// all intra modes except DC and TM
#define DIRECTIONAL_MODES (INTRA_MODES - 2)
#endif // CONFIG_EXT_INTRA
-#if CONFIG_MOTION_VAR
-typedef enum {
- SIMPLE_TRANSLATION = 0, // regular block based motion compensation
- OBMC_CAUSAL = 1, // 2-sided overlapped block prediction
- MOTION_MODES = 2
-} MOTION_MODE;
-#endif // CONFIG_MOTION_VAR
+#if CONFIG_EXT_INTER
+#define INTER_MODES (1 + NEWFROMNEARMV - NEARESTMV)
+#else
+#define INTER_MODES (1 + NEWMV - NEARESTMV)
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_EXT_INTER
+#define INTER_COMPOUND_MODES (1 + NEW_NEWMV - NEAREST_NEARESTMV)
+#endif // CONFIG_EXT_INTER
#define SKIP_CONTEXTS 3
@@ -260,14 +402,22 @@
#if CONFIG_REF_MV
#define MAX_REF_MV_STACK_SIZE 16
+#if CONFIG_EXT_PARTITION
+#define REF_CAT_LEVEL 640
+#else
#define REF_CAT_LEVEL 160
-#endif
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_REF_MV
#define INTRA_INTER_CONTEXTS 4
#define COMP_INTER_CONTEXTS 5
#define REF_CONTEXTS 5
-// Reference frame types
+#if CONFIG_VAR_TX
+#define TXFM_PARTITION_CONTEXTS 16
+typedef uint8_t TXFM_CONTEXT;
+#endif
+
#define NONE -1
#define INTRA_FRAME 0
#define LAST_FRAME 1
@@ -278,27 +428,49 @@
#define GOLDEN_FRAME 4
#define BWDREF_FRAME 5
#define ALTREF_FRAME 6
-#define MAX_REF_FRAMES 7
#define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1)
#else
#define GOLDEN_FRAME 2
#define ALTREF_FRAME 3
-#define MAX_REF_FRAMES 4
#endif // CONFIG_EXT_REFS
+#define INTER_REFS_PER_FRAME (ALTREF_FRAME - LAST_FRAME + 1)
+#define TOTAL_REFS_PER_FRAME (ALTREF_FRAME - INTRA_FRAME + 1)
+
#define FWD_REFS (GOLDEN_FRAME - LAST_FRAME + 1)
#define FWD_RF_OFFSET(ref) (ref - LAST_FRAME)
-
#if CONFIG_EXT_REFS
#define BWD_REFS (ALTREF_FRAME - BWDREF_FRAME + 1)
#define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME)
#else
#define BWD_REFS 1
#define BWD_RF_OFFSET(ref) (ref - ALTREF_FRAME)
-#endif
+#endif // CONFIG_EXT_REFS
+
#define SINGLE_REFS (FWD_REFS + BWD_REFS)
#define COMP_REFS (FWD_REFS * BWD_REFS)
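+// E.g. with CONFIG_EXT_REFS: FWD_REFS = 4 (LAST, LAST2, LAST3, GOLDEN) and
+// BWD_REFS = 2 (BWDREF, ALTREF), so SINGLE_REFS = 6 and COMP_REFS = 8;
+// without it, FWD_REFS = 2, BWD_REFS = 1, SINGLE_REFS = 3, COMP_REFS = 2.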
+#if CONFIG_REF_MV
+#define MODE_CTX_REF_FRAMES (TOTAL_REFS_PER_FRAME + COMP_REFS)
+#else
+#define MODE_CTX_REF_FRAMES TOTAL_REFS_PER_FRAME
+#endif
+
+#if CONFIG_SUPERTX
+#define PARTITION_SUPERTX_CONTEXTS 2
+#define MAX_SUPERTX_BLOCK_SIZE BLOCK_32X32
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_LOOP_RESTORATION
+typedef enum {
+ RESTORE_NONE,
+ RESTORE_BILATERAL,
+ RESTORE_WIENER,
+ RESTORE_SWITCHABLE,
+ RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE,
+ RESTORE_TYPES,
+} RestorationType;
+#endif // CONFIG_LOOP_RESTORATION
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/common/filter.c b/av1/common/filter.c
index 573f51a..21526fc 100644
--- a/av1/common/filter.c
+++ b/av1/common/filter.c
@@ -13,8 +13,8 @@
#include "av1/common/filter.h"
-DECLARE_ALIGNED(256, static const int16_t,
- bilinear_filters[SUBPEL_SHIFTS][8]) = {
+DECLARE_ALIGNED(256, static const InterpKernel,
+ bilinear_filters[SUBPEL_SHIFTS]) = {
{ 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 },
{ 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 },
{ 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 },
@@ -25,55 +25,58 @@
{ 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 }
};
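// Note: each kernel row above sums to 128, i.e. unity gain at the usual
// 7-bit (1 << FILTER_BITS) filter precision.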
-// Lagrangian interpolation filter
-DECLARE_ALIGNED(256, static const int16_t,
- sub_pel_filters_8[SUBPEL_SHIFTS][8]) = {
-#if CONFIG_FILTER_7BIT
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 },
- { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 },
- { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
- { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 },
- { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 },
- { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 },
- { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 },
- { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 }
-#else
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 1, -5, 126, 8, -3, 1, 0 },
- { -1, 3, -10, 122, 18, -6, 2, 0 }, { -1, 4, -13, 118, 27, -9, 3, -1 },
- { -1, 4, -16, 112, 37, -11, 4, -1 }, { -1, 5, -18, 105, 48, -14, 4, -1 },
- { -1, 5, -19, 97, 58, -16, 5, -1 }, { -1, 6, -19, 88, 68, -18, 5, -1 },
- { -1, 6, -19, 78, 78, -19, 6, -1 }, { -1, 5, -18, 68, 88, -19, 6, -1 },
- { -1, 5, -16, 58, 97, -19, 5, -1 }, { -1, 4, -14, 48, 105, -18, 5, -1 },
- { -1, 4, -11, 37, 112, -16, 4, -1 }, { -1, 3, -9, 27, 118, -13, 4, -1 },
- { 0, 2, -6, 18, 122, -10, 3, -1 }, { 0, 1, -3, 8, 126, -5, 1, 0 }
-#endif
+#if USE_TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(16, static const int16_t,
+ sub_pel_filters_temporalfilter_12[SUBPEL_SHIFTS][12]) = {
+ // intfilt 0.8
+ { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 },
+ { 0, 1, -1, 3, -7, 127, 8, -4, 2, -1, 0, 0 },
+ { 0, 1, -3, 5, -12, 124, 18, -8, 4, -2, 1, 0 },
+ { -1, 2, -4, 8, -17, 120, 28, -11, 6, -3, 1, -1 },
+ { -1, 2, -4, 10, -21, 114, 38, -15, 8, -4, 2, -1 },
+ { -1, 3, -5, 11, -23, 107, 49, -18, 9, -5, 2, -1 },
+ { -1, 3, -6, 12, -25, 99, 60, -21, 11, -6, 3, -1 },
+ { -1, 3, -6, 12, -25, 90, 70, -23, 12, -6, 3, -1 },
+ { -1, 3, -6, 12, -24, 80, 80, -24, 12, -6, 3, -1 },
+ { -1, 3, -6, 12, -23, 70, 90, -25, 12, -6, 3, -1 },
+ { -1, 3, -6, 11, -21, 60, 99, -25, 12, -6, 3, -1 },
+ { -1, 2, -5, 9, -18, 49, 107, -23, 11, -5, 3, -1 },
+ { -1, 2, -4, 8, -15, 38, 114, -21, 10, -4, 2, -1 },
+ { -1, 1, -3, 6, -11, 28, 120, -17, 8, -4, 2, -1 },
+ { 0, 1, -2, 4, -8, 18, 124, -12, 5, -3, 1, 0 },
+ { 0, 0, -1, 2, -4, 8, 127, -7, 3, -1, 1, 0 },
};
-
-// DCT based filter
-DECLARE_ALIGNED(256, static const int16_t,
- sub_pel_filters_8sharp[SUBPEL_SHIFTS][8]) = {
-#if CONFIG_FILTER_7BIT
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 },
- { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 },
- { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
- { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
- { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
- { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
- { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
- { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 }
-#else
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { -1, 3, -7, 127, 8, -3, 1, 0 },
- { -2, 5, -13, 125, 17, -6, 3, -1 }, { -3, 7, -17, 121, 27, -10, 5, -2 },
- { -4, 9, -20, 115, 37, -13, 6, -2 }, { -4, 10, -23, 108, 48, -16, 8, -3 },
- { -4, 10, -24, 100, 59, -19, 9, -3 }, { -4, 11, -24, 90, 70, -21, 10, -4 },
- { -4, 11, -23, 80, 80, -23, 11, -4 }, { -4, 10, -21, 70, 90, -24, 11, -4 },
- { -3, 9, -19, 59, 100, -24, 10, -4 }, { -3, 8, -16, 48, 108, -23, 10, -4 },
- { -2, 6, -13, 37, 115, -20, 9, -4 }, { -2, 5, -10, 27, 121, -17, 7, -3 },
- { -1, 3, -6, 17, 125, -13, 5, -2 }, { 0, 1, -3, 8, 127, -7, 3, -1 }
-#endif
-};
+#endif // USE_TEMPORALFILTER_12TAP
#if CONFIG_EXT_INTERP
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+ // intfilt 0.575
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 1, -5, 126, 8, -3, 1, 0 },
+ { -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 4, -14, 118, 27, -9, 3, 0 },
+ { -1, 5, -16, 112, 37, -12, 4, -1 }, { -1, 5, -18, 105, 48, -14, 4, -1 },
+ { -1, 6, -19, 97, 58, -17, 5, -1 }, { -1, 6, -20, 88, 68, -18, 6, -1 },
+ { -1, 6, -19, 78, 78, -19, 6, -1 }, { -1, 6, -18, 68, 88, -20, 6, -1 },
+ { -1, 5, -17, 58, 97, -19, 6, -1 }, { -1, 4, -14, 48, 105, -18, 5, -1 },
+ { -1, 4, -12, 37, 112, -16, 5, -1 }, { 0, 3, -9, 27, 118, -14, 4, -1 },
+ { -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 1, -3, 8, 126, -5, 1, 0 },
+};
+
+#if CONFIG_EXT_INTRA
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
+ // intfilt 0.8
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { -1, 2, -6, 127, 9, -4, 2, -1 },
+ { -2, 5, -12, 124, 18, -7, 4, -2 }, { -2, 7, -16, 119, 28, -11, 5, -2 },
+ { -3, 8, -19, 114, 38, -14, 7, -3 }, { -3, 9, -22, 107, 49, -17, 8, -3 },
+ { -4, 10, -23, 99, 60, -20, 10, -4 }, { -4, 11, -23, 90, 70, -22, 10, -4 },
+ { -4, 11, -23, 80, 80, -23, 11, -4 }, { -4, 10, -22, 70, 90, -23, 11, -4 },
+ { -4, 10, -20, 60, 99, -23, 10, -4 }, { -3, 8, -17, 49, 107, -22, 9, -3 },
+ { -3, 7, -14, 38, 114, -19, 8, -3 }, { -2, 5, -11, 28, 119, -16, 7, -2 },
+ { -2, 4, -7, 18, 124, -12, 5, -2 }, { -1, 2, -4, 9, 127, -6, 2, -1 },
+};
+#endif // CONFIG_EXT_INTRA
+
DECLARE_ALIGNED(256, static const int16_t,
sub_pel_filters_10sharp[SUBPEL_SHIFTS][10]) = {
// intfilt 0.77
@@ -95,6 +98,32 @@
{ 0, -1, 2, -4, 8, 127, -6, 3, -1, 0 },
};
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_8smooth2[SUBPEL_SHIFTS]) = {
+ // freqmultiplier = 0.35
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { -1, 8, 31, 47, 34, 10, 0, -1 },
+ { -1, 7, 29, 46, 36, 12, 0, -1 }, { -1, 6, 28, 46, 37, 13, 0, -1 },
+ { -1, 5, 26, 46, 38, 14, 1, -1 }, { -1, 4, 25, 45, 39, 16, 1, -1 },
+ { -1, 4, 23, 44, 41, 17, 1, -1 }, { -1, 3, 21, 44, 42, 18, 2, -1 },
+ { -1, 2, 20, 43, 43, 20, 2, -1 }, { -1, 2, 18, 42, 44, 21, 3, -1 },
+ { -1, 1, 17, 41, 44, 23, 4, -1 }, { -1, 1, 16, 39, 45, 25, 4, -1 },
+ { -1, 1, 14, 38, 46, 26, 5, -1 }, { -1, 0, 13, 37, 46, 28, 6, -1 },
+ { -1, 0, 12, 36, 46, 29, 7, -1 }, { -1, 0, 10, 34, 47, 31, 8, -1 },
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
+ // freqmultiplier = 0.75
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 2, -10, 19, 95, 31, -11, 2, 0 },
+ { 2, -9, 14, 94, 37, -12, 2, 0 }, { 2, -8, 9, 92, 43, -12, 1, 1 },
+ { 2, -7, 5, 90, 49, -12, 1, 0 }, { 2, -5, 1, 86, 55, -12, 0, 1 },
+ { 1, -4, -2, 82, 61, -11, 0, 1 }, { 1, -3, -5, 77, 67, -9, -1, 1 },
+ { 1, -2, -7, 72, 72, -7, -2, 1 }, { 1, -1, -9, 67, 77, -5, -3, 1 },
+ { 1, 0, -11, 61, 82, -2, -4, 1 }, { 1, 0, -12, 55, 86, 1, -5, 2 },
+ { 0, 1, -12, 49, 90, 5, -7, 2 }, { 1, 1, -12, 43, 92, 9, -8, 2 },
+ { 0, 2, -12, 37, 94, 14, -9, 2 }, { 0, 2, -11, 31, 95, 19, -10, 2 },
+};
+
DECLARE_ALIGNED(16, static const int16_t,
sub_pel_filters_12sharp[SUBPEL_SHIFTS][12]) = {
// intfilt 0.85
@@ -115,38 +144,56 @@
{ -1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1 },
{ 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0 },
};
-
-DECLARE_ALIGNED(256, static const int16_t,
- sub_pel_filters_8smooth[SUBPEL_SHIFTS][8]) = {
- // freqmultiplier = 0.75
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { 2, -10, 19, 95, 31, -11, 2, 0 },
- { 2, -9, 14, 94, 37, -12, 2, 0 }, { 2, -8, 9, 92, 43, -12, 1, 1 },
- { 2, -7, 5, 90, 49, -12, 1, 0 }, { 2, -5, 1, 86, 55, -12, 0, 1 },
- { 1, -4, -2, 82, 61, -11, 0, 1 }, { 1, -3, -5, 77, 67, -9, -1, 1 },
- { 1, -2, -7, 72, 72, -7, -2, 1 }, { 1, -1, -9, 67, 77, -5, -3, 1 },
- { 1, 0, -11, 61, 82, -2, -4, 1 }, { 1, 0, -12, 55, 86, 1, -5, 2 },
- { 0, 1, -12, 49, 90, 5, -7, 2 }, { 1, 1, -12, 43, 92, 9, -8, 2 },
- { 0, 2, -12, 37, 94, 14, -9, 2 }, { 0, 2, -11, 31, 95, 19, -10, 2 },
-};
-
-DECLARE_ALIGNED(256, static const int16_t,
- sub_pel_filters_8smooth2[SUBPEL_SHIFTS][8]) = {
- // freqmultiplier = 0.35
- { 0, 0, 0, 128, 0, 0, 0, 0 }, { -1, 8, 31, 47, 34, 10, 0, -1 },
- { -1, 7, 29, 46, 36, 12, 0, -1 }, { -1, 6, 28, 46, 37, 13, 0, -1 },
- { -1, 5, 26, 46, 38, 14, 1, -1 }, { -1, 4, 25, 45, 39, 16, 1, -1 },
- { -1, 4, 23, 44, 41, 17, 1, -1 }, { -1, 3, 21, 44, 42, 18, 2, -1 },
- { -1, 2, 20, 43, 43, 20, 2, -1 }, { -1, 2, 18, 42, 44, 21, 3, -1 },
- { -1, 1, 17, 41, 44, 23, 4, -1 }, { -1, 1, 16, 39, 45, 25, 4, -1 },
- { -1, 1, 14, 38, 46, 26, 5, -1 }, { -1, 0, 13, 37, 46, 28, 6, -1 },
- { -1, 0, 12, 36, 46, 29, 7, -1 }, { -1, 0, 10, 34, 47, 31, 8, -1 },
-};
-
#else // CONFIG_EXT_INTERP
-// freqmultiplier = 0.5
-DECLARE_ALIGNED(256, static const int16_t,
- sub_pel_filters_8smooth[SUBPEL_SHIFTS][8]) = {
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+#if CONFIG_FILTER_7BIT
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 },
+ { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 },
+ { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
+ { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 },
+ { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 },
+ { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 },
+ { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 },
+ { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 }
+#else
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 1, -5, 126, 8, -3, 1, 0 },
+ { -1, 3, -10, 122, 18, -6, 2, 0 }, { -1, 4, -13, 118, 27, -9, 3, -1 },
+ { -1, 4, -16, 112, 37, -11, 4, -1 }, { -1, 5, -18, 105, 48, -14, 4, -1 },
+ { -1, 5, -19, 97, 58, -16, 5, -1 }, { -1, 6, -19, 88, 68, -18, 5, -1 },
+ { -1, 6, -19, 78, 78, -19, 6, -1 }, { -1, 5, -18, 68, 88, -19, 6, -1 },
+ { -1, 5, -16, 58, 97, -19, 5, -1 }, { -1, 4, -14, 48, 105, -18, 5, -1 },
+ { -1, 4, -11, 37, 112, -16, 4, -1 }, { -1, 3, -9, 27, 118, -13, 4, -1 },
+ { 0, 2, -6, 18, 122, -10, 3, -1 }, { 0, 1, -3, 8, 126, -5, 1, 0 }
+#endif
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
+#if CONFIG_FILTER_7BIT
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 },
+ { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 },
+ { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
+ { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
+ { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
+ { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
+ { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
+ { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 }
+#else
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { -1, 3, -7, 127, 8, -3, 1, 0 },
+ { -2, 5, -13, 125, 17, -6, 3, -1 }, { -3, 7, -17, 121, 27, -10, 5, -2 },
+ { -4, 9, -20, 115, 37, -13, 6, -2 }, { -4, 10, -23, 108, 48, -16, 8, -3 },
+ { -4, 10, -24, 100, 59, -19, 9, -3 }, { -4, 11, -24, 90, 70, -21, 10, -4 },
+ { -4, 11, -23, 80, 80, -23, 11, -4 }, { -4, 10, -21, 70, 90, -24, 11, -4 },
+ { -3, 9, -19, 59, 100, -24, 10, -4 }, { -3, 8, -16, 48, 108, -23, 10, -4 },
+ { -2, 6, -13, 37, 115, -20, 9, -4 }, { -2, 5, -10, 27, 121, -17, 7, -3 },
+ { -1, 3, -6, 17, 125, -13, 5, -2 }, { 0, 1, -3, 8, 127, -7, 3, -1 }
+#endif
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
#if CONFIG_FILTER_7BIT
{ 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 },
{ 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
@@ -167,41 +214,68 @@
{ 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 1, 38, 64, 32, -1, -3 }
#endif
};
-
#endif // CONFIG_EXT_INTERP
-const InterpKernel *av1_filter_kernels[4] = { sub_pel_filters_8,
- sub_pel_filters_8smooth,
- sub_pel_filters_8sharp,
- bilinear_filters };
+#if CONFIG_EXT_INTRA
+const InterpKernel *av1_intra_filter_kernels[INTRA_FILTERS] = {
+ bilinear_filters, // INTRA_FILTER_LINEAR
+ sub_pel_filters_8, // INTRA_FILTER_8TAP
+ sub_pel_filters_8sharp, // INTRA_FILTER_8TAP_SHARP
+ sub_pel_filters_8smooth, // INTRA_FILTER_8TAP_SMOOTH
+};
+#endif // CONFIG_EXT_INTRA
+
#if CONFIG_EXT_INTERP
-static const InterpFilterParams interp_filter_params_list[SWITCHABLE_FILTERS +
- 1] = {
- { (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS, EIGHTTAP },
- { (const int16_t *)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
- EIGHTTAP_SMOOTH },
- { (const int16_t *)sub_pel_filters_10sharp, 10, SUBPEL_SHIFTS,
- MULTITAP_SHARP },
- { (const int16_t *)sub_pel_filters_8smooth2, SUBPEL_TAPS, SUBPEL_SHIFTS,
- EIGHTTAP_SMOOTH2 },
- { (const int16_t *)sub_pel_filters_12sharp, 12, SUBPEL_SHIFTS,
- MULTITAP_SHARP2 },
- { (const int16_t *)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS, BILINEAR }
-};
-#else // CONFIG_EXT_INTERP
-static const InterpFilterParams interp_filter_params_list[SWITCHABLE_FILTERS +
- 1] = {
- { (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS, EIGHTTAP },
- { (const int16_t *)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
- EIGHTTAP_SMOOTH },
- { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
- EIGHTTAP_SHARP },
- { (const int16_t *)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS, BILINEAR }
-};
+static const InterpFilterParams
+ av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
+ { (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_REGULAR },
+ { (const int16_t *)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_SMOOTH },
+ { (const int16_t *)sub_pel_filters_10sharp, 10, SUBPEL_SHIFTS,
+ MULTITAP_SHARP },
+ { (const int16_t *)sub_pel_filters_8smooth2, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_SMOOTH2 },
+ { (const int16_t *)sub_pel_filters_12sharp, 12, SUBPEL_SHIFTS,
+ MULTITAP_SHARP2 },
+ { (const int16_t *)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ BILINEAR }
+ };
+#else
+static const InterpFilterParams
+ av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
+ { (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_REGULAR },
+ { (const int16_t *)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_SMOOTH },
+ { (const int16_t *)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ MULTITAP_SHARP },
+ { (const int16_t *)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ BILINEAR }
+ };
#endif // CONFIG_EXT_INTERP
-InterpFilterParams get_interp_filter_params(InterpFilter interp_filter) {
- InterpFilterParams params = interp_filter_params_list[interp_filter];
- assert(params.interp_filter == interp_filter);
- return params;
+#if USE_TEMPORALFILTER_12TAP
+static const InterpFilterParams av1_interp_temporalfilter_12tap = {
+ (const int16_t *)sub_pel_filters_temporalfilter_12, 12, SUBPEL_SHIFTS,
+ TEMPORALFILTER_12TAP
+};
+#endif // USE_TEMPORALFILTER_12TAP
+
+InterpFilterParams av1_get_interp_filter_params(
+ const InterpFilter interp_filter) {
+#if USE_TEMPORALFILTER_12TAP
+ if (interp_filter == TEMPORALFILTER_12TAP)
+ return av1_interp_temporalfilter_12tap;
+#endif // USE_TEMPORALFILTER_12TAP
+ return av1_interp_filter_params_list[interp_filter];
+}
+
+const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter) {
+#if USE_TEMPORALFILTER_12TAP
+ if (interp_filter == TEMPORALFILTER_12TAP)
+ return av1_interp_temporalfilter_12tap.filter_ptr;
+#endif // USE_TEMPORALFILTER_12TAP
+ return (const int16_t *)av1_interp_filter_params_list[interp_filter]
+ .filter_ptr;
}
diff --git a/av1/common/filter.h b/av1/common/filter.h
index bb9e60c..eb39a7f 100644
--- a/av1/common/filter.h
+++ b/av1/common/filter.h
@@ -21,38 +21,58 @@
extern "C" {
#endif
-#if CONFIG_EXT_INTERP
-#define EIGHTTAP 0
+#define EIGHTTAP_REGULAR 0
#define EIGHTTAP_SMOOTH 1
-#define EIGHTTAP_SHARP 2
-#define MULTITAP_SHARP EIGHTTAP_SHARP
+#define MULTITAP_SHARP 2
+
+#if CONFIG_EXT_INTERP
#define EIGHTTAP_SMOOTH2 3
#define MULTITAP_SHARP2 4
-#define SWITCHABLE_FILTERS 5 /* Number of switchable filters */
-// (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS
-#define LOG_SWITCHABLE_FILTERS 3
+#define MAX_SUBPEL_TAPS 12
+#define SUPPORT_NONINTERPOLATING_FILTERS 0 /* turn on for experimentation */
+#define SWITCHABLE_FILTERS 5 /* Number of switchable filters */
+#define LOG_SWITCHABLE_FILTERS \
+ 3 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
#else
-#define EIGHTTAP 0
-#define EIGHTTAP_SMOOTH 1
-#define EIGHTTAP_SHARP 2
#define SWITCHABLE_FILTERS 3 /* Number of switchable filters */
-
-// (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS
-#define LOG_SWITCHABLE_FILTERS 2
-
+#define LOG_SWITCHABLE_FILTERS \
+ 2 /* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
#endif // CONFIG_EXT_INTERP
-#define BILINEAR SWITCHABLE_FILTERS
+
+#define USE_TEMPORALFILTER_12TAP 1
+#if USE_TEMPORALFILTER_12TAP
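+// Note: this shares its numeric value with SWITCHABLE, so
+// av1_get_interp_filter_params() special-cases it before indexing the
+// switchable filter list.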
+#define TEMPORALFILTER_12TAP (SWITCHABLE_FILTERS + 1)
+#endif
// The codec can operate in four possible inter prediction filter modes:
// 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
+
+#define BILINEAR (SWITCHABLE_FILTERS)
+#define SWITCHABLE (SWITCHABLE_FILTERS + 1) /* the last one */
+#if CONFIG_DUAL_FILTER
+#define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4)
+#define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1)
+#define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2)
+#else
#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
-#define SWITCHABLE (SWITCHABLE_FILTERS + 1) /* should be the last one */
+#endif
typedef uint8_t InterpFilter;
-extern const InterpKernel *av1_filter_kernels[4];
+#if CONFIG_EXT_INTRA
+typedef enum {
+ INTRA_FILTER_LINEAR,
+ INTRA_FILTER_8TAP,
+ INTRA_FILTER_8TAP_SHARP,
+ INTRA_FILTER_8TAP_SMOOTH,
+ INTRA_FILTERS,
+} INTRA_FILTER;
+
+extern const InterpKernel *av1_intra_filter_kernels[INTRA_FILTERS];
+#endif // CONFIG_EXT_INTRA
+
typedef struct InterpFilterParams {
const int16_t *filter_ptr;
uint16_t taps;
@@ -60,11 +80,21 @@
InterpFilter interp_filter;
} InterpFilterParams;
-static INLINE const int16_t *get_interp_filter_subpel_kernel(
+InterpFilterParams av1_get_interp_filter_params(
+ const InterpFilter interp_filter);
+
+const int16_t *av1_get_interp_filter_kernel(const InterpFilter interp_filter);
+
+static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
const InterpFilterParams filter_params, const int subpel) {
return filter_params.filter_ptr + filter_params.taps * subpel;
}
-InterpFilterParams get_interp_filter_params(InterpFilter interp_filter);
+
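+// A filter is interpolating if its phase-0 kernel is a unit impulse
+// (center tap equal to 128), i.e. integer-pel samples pass through
+// unchanged.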
+static INLINE int av1_is_interpolating_filter(
+ const InterpFilter interp_filter) {
+ const InterpFilterParams ip = av1_get_interp_filter_params(interp_filter);
+ return (ip.filter_ptr[ip.taps / 2 - 1] == 128);
+}
#ifdef __cplusplus
} // extern "C"
diff --git a/av1/common/idct.c b/av1/common/idct.c
index e3269ec..2663d2d 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -11,111 +11,996 @@
#include <math.h>
-#include "./av1_rtcd.h"
#include "./aom_dsp_rtcd.h"
-#include "av1/common/blockd.h"
-#include "av1/common/idct.h"
+#include "./av1_rtcd.h"
#include "aom_dsp/inv_txfm.h"
#include "aom_ports/mem.h"
+#include "av1/common/av1_inv_txfm2d_cfg.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+
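+// Extra downscaling (in bits) used for the larger transform sizes:
+// 1 for 32x32, 2 for 64x64, 0 otherwise.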
+int get_tx_scale(const MACROBLOCKD *const xd, const TX_TYPE tx_type,
+ const TX_SIZE tx_size) {
+ (void)tx_type;
+ (void)xd;
+ if (txsize_sqr_up_map[tx_size] == TX_32X32) return 1;
+#if CONFIG_TX64X64
+ else if (txsize_sqr_up_map[tx_size] == TX_64X64)
+ return 2;
+#endif // CONFIG_TX64X64
+ else
+ return 0;
+}
+
+// NOTE: The implementation of all inverses needs to be aware of the fact
+// that input and output could be the same buffer.
+
+#if CONFIG_EXT_TX
+static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 4; ++i)
+ output[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+}
+
+static void iidtx8_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 8; ++i) output[i] = input[i] * 2;
+}
+
+static void iidtx16_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 16; ++i)
+ output[i] = (tran_low_t)dct_const_round_shift(input[i] * 2 * Sqrt2);
+}
+
+static void iidtx32_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
+}
+
+#if CONFIG_TX64X64
+static void iidtx64_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 64; ++i)
+ output[i] = (tran_low_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_EXT_TX
+
+// For use in lieu of ADST
+static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ tran_low_t inputhalf[16];
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 16; ++i) {
+ inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+ }
+ for (i = 0; i < 16; ++i) {
+ output[i] = input[16 + i] * 4;
+ }
+ aom_idct16_c(inputhalf, output + 16);
+ // Note overall scaling factor is 4 times orthogonal
+}
+
+#if CONFIG_TX64X64
+static void idct64_col_c(const tran_low_t *input, tran_low_t *output) {
+ int32_t in[64], out[64];
+ int i;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_idct64_new(in, out, inv_cos_bit_col_dct_dct_64,
+ inv_stage_range_col_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void idct64_row_c(const tran_low_t *input, tran_low_t *output) {
+ int32_t in[64], out[64];
+ int i;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_idct64_new(in, out, inv_cos_bit_row_dct_dct_64,
+ inv_stage_range_row_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+// For use in lieu of ADST
+static void ihalfright64_c(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ tran_low_t inputhalf[32];
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 32; ++i) {
+ inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+ }
+ for (i = 0; i < 32; ++i) {
+ output[i] = (tran_low_t)dct_const_round_shift(input[32 + i] * 4 * Sqrt2);
+ }
+ aom_idct32_c(inputhalf, output + 32);
+ // Note overall scaling factor is 4 * sqrt(2) times orthogonal
+}
+#endif // CONFIG_TX64X64
+
+#if CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_EXT_TX
+static void highbd_iidtx4_c(const tran_low_t *input, tran_low_t *output,
+ int bd) {
+ int i;
+ for (i = 0; i < 4; ++i)
+ output[i] =
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[i] * Sqrt2), bd);
+}
+
+static void highbd_iidtx8_c(const tran_low_t *input, tran_low_t *output,
+ int bd) {
+ int i;
+ (void)bd;
+ for (i = 0; i < 8; ++i) output[i] = input[i] * 2;
+}
+
+static void highbd_iidtx16_c(const tran_low_t *input, tran_low_t *output,
+ int bd) {
+ int i;
+ for (i = 0; i < 16; ++i)
+ output[i] =
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[i] * 2 * Sqrt2), bd);
+}
+
+static void highbd_iidtx32_c(const tran_low_t *input, tran_low_t *output,
+ int bd) {
+ int i;
+ (void)bd;
+ for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
+}
+#endif // CONFIG_EXT_TX
+
+static void highbd_ihalfright32_c(const tran_low_t *input, tran_low_t *output,
+ int bd) {
+ int i;
+ tran_low_t inputhalf[16];
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 16; ++i) {
+ inputhalf[i] =
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[i] * Sqrt2), bd);
+ }
+ for (i = 0; i < 16; ++i) {
+ output[i] = input[16 + i] * 4;
+ }
+ aom_highbd_idct16_c(inputhalf, output + 16, bd);
+ // Note overall scaling factor is 4 times orthogonal
+}
+
+#if CONFIG_EXT_TX
+#if CONFIG_TX64X64
+static void highbd_iidtx64_c(const tran_low_t *input, tran_low_t *output,
+ int bd) {
+ int i;
+ for (i = 0; i < 64; ++i)
+ output[i] =
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[i] * 4 * Sqrt2), bd);
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_EXT_TX
+
+#if CONFIG_TX64X64
+// For use in lieu of ADST
+static void highbd_ihalfright64_c(const tran_low_t *input, tran_low_t *output,
+ int bd) {
+ int i;
+ tran_low_t inputhalf[32];
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 32; ++i) {
+ inputhalf[i] =
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[i] * Sqrt2), bd);
+ }
+ for (i = 0; i < 32; ++i) {
+ output[i] = HIGHBD_WRAPLOW(
+ highbd_dct_const_round_shift(input[32 + i] * 4 * Sqrt2), bd);
+ }
+ aom_highbd_idct32_c(inputhalf, output + 32, bd);
+ // Note overall scaling factor is 4 * sqrt(2) times orthogonal
+}
+
+static void highbd_idct64_col_c(const tran_low_t *input, tran_low_t *output,
+ int bd) {
+ int32_t in[64], out[64];
+ int i;
+ (void)bd;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_idct64_new(in, out, inv_cos_bit_col_dct_dct_64,
+ inv_stage_range_col_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void highbd_idct64_row_c(const tran_low_t *input, tran_low_t *output,
+ int bd) {
+ int32_t in[64], out[64];
+ int i;
+ (void)bd;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_idct64_new(in, out, inv_cos_bit_row_dct_dct_64,
+ inv_stage_range_row_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+// Inverse identity transform and add.
+#if CONFIG_EXT_TX
+static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int bs, int tx_type) {
+ int r, c;
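+  // Assumption: the downshift shrinks as the block grows because the
+  // forward identity transform applies a correspondingly larger gain at
+  // larger block sizes.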
+ const int shift = bs < 32 ? 3 : (bs < 64 ? 2 : 1);
+ if (tx_type == IDTX) {
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c)
+ dest[c] = clip_pixel_add(dest[c], input[c] >> shift);
+ dest += stride;
+ input += bs;
+ }
+ }
+}
+#endif // CONFIG_EXT_TX
+
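+// Re-point the buffer at its last row and negate the stride so that the
+// subsequent row-by-row walk effectively flips the block upside down.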
+#define FLIPUD_PTR(dest, stride, size) \
+ do { \
+ (dest) = (dest) + ((size)-1) * (stride); \
+ (stride) = -(stride); \
+ } while (0)
+
+#if CONFIG_EXT_TX
+static void maybe_flip_strides(uint8_t **dst, int *dstride, tran_low_t **src,
+ int *sstride, int tx_type, int sizey,
+ int sizex) {
+ // Note that the transpose of src will be added to dst. In order to LR
+ // flip the addends (in dst coordinates), we UD flip the src. To UD flip
+ // the addends, we UD flip the dst.
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case IDTX:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST: break;
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST:
+ // flip UD
+ FLIPUD_PTR(*dst, *dstride, sizey);
+ break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ // flip LR
+ FLIPUD_PTR(*src, *sstride, sizex);
+ break;
+ case FLIPADST_FLIPADST:
+ // flip UD
+ FLIPUD_PTR(*dst, *dstride, sizey);
+ // flip LR
+ FLIPUD_PTR(*src, *sstride, sizex);
+ break;
+ default: assert(0); break;
+ }
+}
+#endif // CONFIG_EXT_TX
+
+#if CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_EXT_TX
+static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bs, int tx_type, int bd) {
+ int r, c;
+ const int shift = bs < 32 ? 3 : 2;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ if (tx_type == IDTX) {
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c)
+ dest[c] = highbd_clip_pixel_add(dest[c], input[c] >> shift, bd);
+ dest += stride;
+ input += bs;
+ }
+ }
+}
+
+static void maybe_flip_strides16(uint16_t **dst, int *dstride, tran_low_t **src,
+ int *sstride, int tx_type, int sizey,
+ int sizex) {
+ // Note that the transpose of src will be added to dst. In order to LR
+ // flip the addends (in dst coordinates), we UD flip the src. To UD flip
+ // the addends, we UD flip the dst.
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case IDTX:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST: break;
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST:
+ // flip UD
+ FLIPUD_PTR(*dst, *dstride, sizey);
+ break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ // flip LR
+ FLIPUD_PTR(*src, *sstride, sizex);
+ break;
+ case FLIPADST_FLIPADST:
+ // flip UD
+ FLIPUD_PTR(*dst, *dstride, sizey);
+ // flip LR
+ FLIPUD_PTR(*src, *sstride, sizex);
+ break;
+ default: assert(0); break;
+ }
+}
+#endif // CONFIG_EXT_TX
+#endif // CONFIG_AOM_HIGHBITDEPTH
void av1_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
- const transform_2d IHT_4[] = {
- { aom_idct4_c, aom_idct4_c }, // DCT_DCT = 0
- { aom_iadst4_c, aom_idct4_c }, // ADST_DCT = 1
- { aom_idct4_c, aom_iadst4_c }, // DCT_ADST = 2
- { aom_iadst4_c, aom_iadst4_c } // ADST_ADST = 3
+ static const transform_2d IHT_4[] = {
+ { aom_idct4_c, aom_idct4_c }, // DCT_DCT = 0
+ { aom_iadst4_c, aom_idct4_c }, // ADST_DCT = 1
+ { aom_idct4_c, aom_iadst4_c }, // DCT_ADST = 2
+ { aom_iadst4_c, aom_iadst4_c }, // ADST_ADST = 3
+#if CONFIG_EXT_TX
+ { aom_iadst4_c, aom_idct4_c }, // FLIPADST_DCT
+ { aom_idct4_c, aom_iadst4_c }, // DCT_FLIPADST
+ { aom_iadst4_c, aom_iadst4_c }, // FLIPADST_FLIPADST
+ { aom_iadst4_c, aom_iadst4_c }, // ADST_FLIPADST
+ { aom_iadst4_c, aom_iadst4_c }, // FLIPADST_ADST
+ { iidtx4_c, iidtx4_c }, // IDTX
+ { aom_idct4_c, iidtx4_c }, // V_DCT
+ { iidtx4_c, aom_idct4_c }, // H_DCT
+ { aom_iadst4_c, iidtx4_c }, // V_ADST
+ { iidtx4_c, aom_iadst4_c }, // H_ADST
+ { aom_iadst4_c, iidtx4_c }, // V_FLIPADST
+ { iidtx4_c, aom_iadst4_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
};
int i, j;
- tran_low_t out[4 * 4];
- tran_low_t *outptr = out;
- tran_low_t temp_in[4], temp_out[4];
+ tran_low_t tmp;
+ tran_low_t out[4][4];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 4;
// inverse transform row vectors
for (i = 0; i < 4; ++i) {
- IHT_4[tx_type].rows(input, outptr);
+ IHT_4[tx_type].rows(input, out[i]);
input += 4;
- outptr += 4;
+ }
+
+ // transpose
+ for (i = 1; i < 4; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
}
// inverse transform column vectors
for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
- IHT_4[tx_type].cols(temp_in, temp_out);
+ IHT_4[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4, 4);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 4));
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
}
}
}
-static const transform_2d IHT_8[] = {
- { aom_idct8_c, aom_idct8_c }, // DCT_DCT = 0
- { aom_iadst8_c, aom_idct8_c }, // ADST_DCT = 1
- { aom_idct8_c, aom_iadst8_c }, // DCT_ADST = 2
- { aom_iadst8_c, aom_iadst8_c } // ADST_ADST = 3
-};
+void av1_iht4x8_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_4x8[] = {
+ { aom_idct8_c, aom_idct4_c }, // DCT_DCT
+ { aom_iadst8_c, aom_idct4_c }, // ADST_DCT
+ { aom_idct8_c, aom_iadst4_c }, // DCT_ADST
+ { aom_iadst8_c, aom_iadst4_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst8_c, aom_idct4_c }, // FLIPADST_DCT
+ { aom_idct8_c, aom_iadst4_c }, // DCT_FLIPADST
+ { aom_iadst8_c, aom_iadst4_c }, // FLIPADST_FLIPADST
+ { aom_iadst8_c, aom_iadst4_c }, // ADST_FLIPADST
+ { aom_iadst8_c, aom_iadst4_c }, // FLIPADST_ADST
+ { iidtx8_c, iidtx4_c }, // IDTX
+ { aom_idct8_c, iidtx4_c }, // V_DCT
+ { iidtx8_c, aom_idct4_c }, // H_DCT
+ { aom_iadst8_c, iidtx4_c }, // V_ADST
+ { iidtx8_c, aom_iadst4_c }, // H_ADST
+ { aom_iadst8_c, iidtx4_c }, // V_FLIPADST
+ { iidtx8_c, aom_iadst4_c }, // H_FLIPADST
+#endif
+ };
+
+ const int n = 4;
+ const int n2 = 8;
+ int i, j;
+ tran_low_t out[4][8], outtmp[4];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n2;
+
+ // inverse transform row vectors and transpose
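+  // The Sqrt2 rescale presumably keeps the overall gain of the rectangular
+  // transform in line with the square transform sizes.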
+ for (i = 0; i < n2; ++i) {
+ IHT_4x8[tx_type].rows(input, outtmp);
+ for (j = 0; j < n; ++j)
+ out[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+ input += n;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n; ++i) {
+ IHT_4x8[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+ }
+ }
+}
+
+void av1_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_8x4[] = {
+ { aom_idct4_c, aom_idct8_c }, // DCT_DCT
+ { aom_iadst4_c, aom_idct8_c }, // ADST_DCT
+ { aom_idct4_c, aom_iadst8_c }, // DCT_ADST
+ { aom_iadst4_c, aom_iadst8_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst4_c, aom_idct8_c }, // FLIPADST_DCT
+ { aom_idct4_c, aom_iadst8_c }, // DCT_FLIPADST
+ { aom_iadst4_c, aom_iadst8_c }, // FLIPADST_FLIPADST
+ { aom_iadst4_c, aom_iadst8_c }, // ADST_FLIPADST
+ { aom_iadst4_c, aom_iadst8_c }, // FLIPADST_ADST
+ { iidtx4_c, iidtx8_c }, // IDTX
+ { aom_idct4_c, iidtx8_c }, // V_DCT
+ { iidtx4_c, aom_idct8_c }, // H_DCT
+ { aom_iadst4_c, iidtx8_c }, // V_ADST
+ { iidtx4_c, aom_iadst8_c }, // H_ADST
+ { aom_iadst4_c, iidtx8_c }, // V_FLIPADST
+ { iidtx4_c, aom_iadst8_c }, // H_FLIPADST
+#endif
+ };
+ const int n = 4;
+ const int n2 = 8;
+
+ int i, j;
+ tran_low_t out[8][4], outtmp[8];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n;
+
+ // inverse transform row vectors and transpose
+ for (i = 0; i < n; ++i) {
+ IHT_8x4[tx_type].rows(input, outtmp);
+ for (j = 0; j < n2; ++j)
+ out[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+ input += n2;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n2; ++i) {
+ IHT_8x4[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+ }
+ }
+}
+
+void av1_iht8x16_128_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_8x16[] = {
+ { aom_idct16_c, aom_idct8_c }, // DCT_DCT
+ { aom_iadst16_c, aom_idct8_c }, // ADST_DCT
+ { aom_idct16_c, aom_iadst8_c }, // DCT_ADST
+ { aom_iadst16_c, aom_iadst8_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst16_c, aom_idct8_c }, // FLIPADST_DCT
+ { aom_idct16_c, aom_iadst8_c }, // DCT_FLIPADST
+ { aom_iadst16_c, aom_iadst8_c }, // FLIPADST_FLIPADST
+ { aom_iadst16_c, aom_iadst8_c }, // ADST_FLIPADST
+ { aom_iadst16_c, aom_iadst8_c }, // FLIPADST_ADST
+ { iidtx16_c, iidtx8_c }, // IDTX
+ { aom_idct16_c, iidtx8_c }, // V_DCT
+ { iidtx16_c, aom_idct8_c }, // H_DCT
+ { aom_iadst16_c, iidtx8_c }, // V_ADST
+ { iidtx16_c, aom_iadst8_c }, // H_ADST
+ { aom_iadst16_c, iidtx8_c }, // V_FLIPADST
+ { iidtx16_c, aom_iadst8_c }, // H_FLIPADST
+#endif
+ };
+
+ const int n = 8;
+ const int n2 = 16;
+ int i, j;
+ tran_low_t out[8][16], outtmp[8];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n2;
+
+ // inverse transform row vectors and transpose
+ for (i = 0; i < n2; ++i) {
+ IHT_8x16[tx_type].rows(input, outtmp);
+ for (j = 0; j < n; ++j)
+ out[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+ input += n;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n; ++i) {
+ IHT_8x16[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+ }
+ }
+}
+
+void av1_iht16x8_128_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_16x8[] = {
+ { aom_idct8_c, aom_idct16_c }, // DCT_DCT
+ { aom_iadst8_c, aom_idct16_c }, // ADST_DCT
+ { aom_idct8_c, aom_iadst16_c }, // DCT_ADST
+ { aom_iadst8_c, aom_iadst16_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst8_c, aom_idct16_c }, // FLIPADST_DCT
+ { aom_idct8_c, aom_iadst16_c }, // DCT_FLIPADST
+ { aom_iadst8_c, aom_iadst16_c }, // FLIPADST_FLIPADST
+ { aom_iadst8_c, aom_iadst16_c }, // ADST_FLIPADST
+ { aom_iadst8_c, aom_iadst16_c }, // FLIPADST_ADST
+ { iidtx8_c, iidtx16_c }, // IDTX
+ { aom_idct8_c, iidtx16_c }, // V_DCT
+ { iidtx8_c, aom_idct16_c }, // H_DCT
+ { aom_iadst8_c, iidtx16_c }, // V_ADST
+ { iidtx8_c, aom_iadst16_c }, // H_ADST
+ { aom_iadst8_c, iidtx16_c }, // V_FLIPADST
+ { iidtx8_c, aom_iadst16_c }, // H_FLIPADST
+#endif
+ };
+ const int n = 8;
+ const int n2 = 16;
+
+ int i, j;
+ tran_low_t out[16][8], outtmp[16];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n;
+
+ // inverse transform row vectors and transpose
+ for (i = 0; i < n; ++i) {
+ IHT_16x8[tx_type].rows(input, outtmp);
+ for (j = 0; j < n2; ++j)
+ out[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+ input += n2;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n2; ++i) {
+ IHT_16x8[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+ }
+ }
+}
+
+void av1_iht16x32_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_16x32[] = {
+ { aom_idct32_c, aom_idct16_c }, // DCT_DCT
+ { ihalfright32_c, aom_idct16_c }, // ADST_DCT
+ { aom_idct32_c, aom_iadst16_c }, // DCT_ADST
+ { ihalfright32_c, aom_iadst16_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { ihalfright32_c, aom_idct16_c }, // FLIPADST_DCT
+ { aom_idct32_c, aom_iadst16_c }, // DCT_FLIPADST
+ { ihalfright32_c, aom_iadst16_c }, // FLIPADST_FLIPADST
+ { ihalfright32_c, aom_iadst16_c }, // ADST_FLIPADST
+ { ihalfright32_c, aom_iadst16_c }, // FLIPADST_ADST
+ { iidtx32_c, iidtx16_c }, // IDTX
+ { aom_idct32_c, iidtx16_c }, // V_DCT
+ { iidtx32_c, aom_idct16_c }, // H_DCT
+ { ihalfright32_c, iidtx16_c }, // V_ADST
+ { iidtx32_c, aom_iadst16_c }, // H_ADST
+ { ihalfright32_c, iidtx16_c }, // V_FLIPADST
+ { iidtx32_c, aom_iadst16_c }, // H_FLIPADST
+#endif
+ };
+
+ const int n = 16;
+ const int n2 = 32;
+ int i, j;
+ tran_low_t out[16][32], outtmp[16];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n2;
+
+ // inverse transform row vectors and transpose
+ for (i = 0; i < n2; ++i) {
+ IHT_16x32[tx_type].rows(input, outtmp);
+ for (j = 0; j < n; ++j)
+ out[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+ input += n;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n; ++i) {
+ IHT_16x32[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+ }
+ }
+}
+
+void av1_iht32x16_512_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_32x16[] = {
+ { aom_idct16_c, aom_idct32_c }, // DCT_DCT
+ { aom_iadst16_c, aom_idct32_c }, // ADST_DCT
+ { aom_idct16_c, ihalfright32_c }, // DCT_ADST
+ { aom_iadst16_c, ihalfright32_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_iadst16_c, aom_idct32_c }, // FLIPADST_DCT
+ { aom_idct16_c, ihalfright32_c }, // DCT_FLIPADST
+ { aom_iadst16_c, ihalfright32_c }, // FLIPADST_FLIPADST
+ { aom_iadst16_c, ihalfright32_c }, // ADST_FLIPADST
+ { aom_iadst16_c, ihalfright32_c }, // FLIPADST_ADST
+ { iidtx16_c, iidtx32_c }, // IDTX
+ { aom_idct16_c, iidtx32_c }, // V_DCT
+ { iidtx16_c, aom_idct32_c }, // H_DCT
+ { aom_iadst16_c, iidtx32_c }, // V_ADST
+ { iidtx16_c, ihalfright32_c }, // H_ADST
+ { aom_iadst16_c, iidtx32_c }, // V_FLIPADST
+ { iidtx16_c, ihalfright32_c }, // H_FLIPADST
+#endif
+ };
+ const int n = 16;
+ const int n2 = 32;
+
+ int i, j;
+ tran_low_t out[32][16], outtmp[32];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n;
+
+ // inverse transform row vectors and transpose
+ for (i = 0; i < n; ++i) {
+ IHT_32x16[tx_type].rows(input, outtmp);
+ for (j = 0; j < n2; ++j)
+ out[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+ input += n2;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n2; ++i) {
+ IHT_32x16[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+ }
+ }
+}
void av1_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
+ static const transform_2d IHT_8[] = {
+ { aom_idct8_c, aom_idct8_c }, // DCT_DCT = 0
+ { aom_iadst8_c, aom_idct8_c }, // ADST_DCT = 1
+ { aom_idct8_c, aom_iadst8_c }, // DCT_ADST = 2
+ { aom_iadst8_c, aom_iadst8_c }, // ADST_ADST = 3
+#if CONFIG_EXT_TX
+ { aom_iadst8_c, aom_idct8_c }, // FLIPADST_DCT
+ { aom_idct8_c, aom_iadst8_c }, // DCT_FLIPADST
+ { aom_iadst8_c, aom_iadst8_c }, // FLIPADST_FLIPADST
+ { aom_iadst8_c, aom_iadst8_c }, // ADST_FLIPADST
+ { aom_iadst8_c, aom_iadst8_c }, // FLIPADST_ADST
+ { iidtx8_c, iidtx8_c }, // IDTX
+ { aom_idct8_c, iidtx8_c }, // V_DCT
+ { iidtx8_c, aom_idct8_c }, // H_DCT
+ { aom_iadst8_c, iidtx8_c }, // V_ADST
+ { iidtx8_c, aom_iadst8_c }, // H_ADST
+ { aom_iadst8_c, iidtx8_c }, // V_FLIPADST
+ { iidtx8_c, aom_iadst8_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+
int i, j;
- tran_low_t out[8 * 8];
- tran_low_t *outptr = out;
- tran_low_t temp_in[8], temp_out[8];
- const transform_2d ht = IHT_8[tx_type];
+ tran_low_t tmp;
+ tran_low_t out[8][8];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 8;
// inverse transform row vectors
for (i = 0; i < 8; ++i) {
- ht.rows(input, outptr);
+ IHT_8[tx_type].rows(input, out[i]);
input += 8;
- outptr += 8;
+ }
+
+ // transpose
+ for (i = 1; i < 8; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
}
// inverse transform column vectors
for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- ht.cols(temp_in, temp_out);
+ IHT_8[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8, 8);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 5));
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
}
}
}
-static const transform_2d IHT_16[] = {
- { aom_idct16_c, aom_idct16_c }, // DCT_DCT = 0
- { aom_iadst16_c, aom_idct16_c }, // ADST_DCT = 1
- { aom_idct16_c, aom_iadst16_c }, // DCT_ADST = 2
- { aom_iadst16_c, aom_iadst16_c } // ADST_ADST = 3
-};
-
void av1_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
- int i, j;
- tran_low_t out[16 * 16];
- tran_low_t *outptr = out;
- tran_low_t temp_in[16], temp_out[16];
- const transform_2d ht = IHT_16[tx_type];
+ static const transform_2d IHT_16[] = {
+ { aom_idct16_c, aom_idct16_c }, // DCT_DCT = 0
+ { aom_iadst16_c, aom_idct16_c }, // ADST_DCT = 1
+ { aom_idct16_c, aom_iadst16_c }, // DCT_ADST = 2
+ { aom_iadst16_c, aom_iadst16_c }, // ADST_ADST = 3
+#if CONFIG_EXT_TX
+ { aom_iadst16_c, aom_idct16_c }, // FLIPADST_DCT
+ { aom_idct16_c, aom_iadst16_c }, // DCT_FLIPADST
+ { aom_iadst16_c, aom_iadst16_c }, // FLIPADST_FLIPADST
+ { aom_iadst16_c, aom_iadst16_c }, // ADST_FLIPADST
+ { aom_iadst16_c, aom_iadst16_c }, // FLIPADST_ADST
+ { iidtx16_c, iidtx16_c }, // IDTX
+ { aom_idct16_c, iidtx16_c }, // V_DCT
+ { iidtx16_c, aom_idct16_c }, // H_DCT
+ { aom_iadst16_c, iidtx16_c }, // V_ADST
+ { iidtx16_c, aom_iadst16_c }, // H_ADST
+ { aom_iadst16_c, iidtx16_c }, // V_FLIPADST
+ { iidtx16_c, aom_iadst16_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
- // Rows
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[16][16];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 16;
+
+ // inverse transform row vectors
for (i = 0; i < 16; ++i) {
- ht.rows(input, outptr);
+ IHT_16[tx_type].rows(input, out[i]);
input += 16;
- outptr += 16;
}
- // Columns
+ // transpose
+ for (i = 1; i < 16; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- ht.cols(temp_in, temp_out);
+ IHT_16[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 16, 16);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
}
}
}
+#if CONFIG_EXT_TX
+void av1_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_32[] = {
+ { aom_idct32_c, aom_idct32_c }, // DCT_DCT
+ { ihalfright32_c, aom_idct32_c }, // ADST_DCT
+ { aom_idct32_c, ihalfright32_c }, // DCT_ADST
+ { ihalfright32_c, ihalfright32_c }, // ADST_ADST
+ { ihalfright32_c, aom_idct32_c }, // FLIPADST_DCT
+ { aom_idct32_c, ihalfright32_c }, // DCT_FLIPADST
+ { ihalfright32_c, ihalfright32_c }, // FLIPADST_FLIPADST
+ { ihalfright32_c, ihalfright32_c }, // ADST_FLIPADST
+ { ihalfright32_c, ihalfright32_c }, // FLIPADST_ADST
+ { iidtx32_c, iidtx32_c }, // IDTX
+ { aom_idct32_c, iidtx32_c }, // V_DCT
+ { iidtx32_c, aom_idct32_c }, // H_DCT
+ { ihalfright32_c, iidtx32_c }, // V_ADST
+ { iidtx32_c, ihalfright32_c }, // H_ADST
+ { ihalfright32_c, iidtx32_c }, // V_FLIPADST
+ { iidtx32_c, ihalfright32_c }, // H_FLIPADST
+ };
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[32][32];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 32;
+
+ // inverse transform row vectors
+ for (i = 0; i < 32; ++i) {
+ IHT_32[tx_type].rows(input, out[i]);
+ input += 32;
+ }
+
+ // transpose
+ for (i = 1; i < 32; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 32; ++i) {
+ IHT_32[tx_type].cols(out[i], out[i]);
+ }
+
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 32, 32);
+
+ // Sum with the destination
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+ }
+ }
+}
+#endif // CONFIG_EXT_TX
+
+#if CONFIG_TX64X64
+void av1_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ static const transform_2d IHT_64[] = {
+ { idct64_col_c, idct64_row_c }, // DCT_DCT
+ { ihalfright64_c, idct64_row_c }, // ADST_DCT
+ { idct64_col_c, ihalfright64_c }, // DCT_ADST
+ { ihalfright64_c, ihalfright64_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { ihalfright64_c, idct64_row_c }, // FLIPADST_DCT
+ { idct64_col_c, ihalfright64_c }, // DCT_FLIPADST
+ { ihalfright64_c, ihalfright64_c }, // FLIPADST_FLIPADST
+ { ihalfright64_c, ihalfright64_c }, // ADST_FLIPADST
+ { ihalfright64_c, ihalfright64_c }, // FLIPADST_ADST
+ { iidtx64_c, iidtx64_c }, // IDTX
+ { idct64_col_c, iidtx64_c }, // V_DCT
+ { iidtx64_c, idct64_row_c }, // H_DCT
+ { ihalfright64_c, iidtx64_c }, // V_ADST
+ { iidtx64_c, ihalfright64_c }, // H_ADST
+ { ihalfright64_c, iidtx64_c }, // V_FLIPADST
+ { iidtx64_c, ihalfright64_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[64][64];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 64;
+
+ // inverse transform row vectors
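+  // Row outputs are halved below; the final shift is then 5 rather than 6,
+  // presumably to keep the wider 64-point intermediates within range.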
+ for (i = 0; i < 64; ++i) {
+ IHT_64[tx_type].rows(input, out[i]);
+ for (j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
+ input += 64;
+ }
+
+ // transpose
+ for (i = 1; i < 64; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 64; ++i) {
+ IHT_64[tx_type].cols(out[i], out[i]);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 64, 64);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < 64; ++i) {
+ for (j = 0; j < 64; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+ }
+ }
+}
+#endif // CONFIG_TX64X64
+
// idct
void av1_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob) {
@@ -179,20 +1064,82 @@
aom_idct32x32_1024_add(input, dest, stride);
}
+#if CONFIG_TX64X64
+void av1_idct64x64_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob) {
+ (void)eob;
+ av1_iht64x64_4096_add(input, dest, stride, DCT_DCT);
+}
+#endif // CONFIG_TX64X64
+
void av1_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, int stride,
int eob, TX_TYPE tx_type, int lossless) {
if (lossless) {
assert(tx_type == DCT_DCT);
av1_iwht4x4_add(input, dest, stride, eob);
- } else {
- switch (tx_type) {
- case DCT_DCT: av1_idct4x4_add(input, dest, stride, eob); break;
- case ADST_DCT:
- case DCT_ADST:
- case ADST_ADST: av1_iht4x4_16_add(input, dest, stride, tx_type); break;
- default: assert(0); break;
- }
+ return;
}
+
+ switch (tx_type) {
+ case DCT_DCT: av1_idct4x4_add(input, dest, stride, eob); break;
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST: av1_iht4x4_16_add(input, dest, stride, tx_type); break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST: av1_iht4x4_16_add(input, dest, stride, tx_type); break;
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ // Use C version since DST only exists in C code
+ av1_iht4x4_16_add_c(input, dest, stride, tx_type);
+ break;
+ case IDTX: inv_idtx_add_c(input, dest, stride, 4, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+
+void av1_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, TX_TYPE tx_type) {
+ (void)eob;
+ av1_iht4x8_32_add(input, dest, stride, tx_type);
+}
+
+void av1_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, TX_TYPE tx_type) {
+ (void)eob;
+ av1_iht8x4_32_add(input, dest, stride, tx_type);
+}
+
+void av1_inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, TX_TYPE tx_type) {
+ (void)eob;
+ av1_iht8x16_128_add(input, dest, stride, tx_type);
+}
+
+void av1_inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, TX_TYPE tx_type) {
+ (void)eob;
+ av1_iht16x8_128_add(input, dest, stride, tx_type);
+}
+
+void av1_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, TX_TYPE tx_type) {
+ (void)eob;
+ av1_iht16x32_512_add(input, dest, stride, tx_type);
+}
+
+void av1_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, TX_TYPE tx_type) {
+ (void)eob;
+ av1_iht32x16_512_add(input, dest, stride, tx_type);
}
void av1_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, int stride,
@@ -202,6 +1149,23 @@
case ADST_DCT:
case DCT_ADST:
case ADST_ADST: av1_iht8x8_64_add(input, dest, stride, tx_type); break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST: av1_iht8x8_64_add(input, dest, stride, tx_type); break;
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ // Use C version since DST only exists in C code
+ av1_iht8x8_64_add_c(input, dest, stride, tx_type);
+ break;
+ case IDTX: inv_idtx_add_c(input, dest, stride, 8, tx_type); break;
+#endif // CONFIG_EXT_TX
default: assert(0); break;
}
}
@@ -213,6 +1177,20 @@
case ADST_DCT:
case DCT_ADST:
case ADST_ADST: av1_iht16x16_256_add(input, dest, stride, tx_type); break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST: av1_iht16x16_256_add(input, dest, stride, tx_type); break;
+ case IDTX: inv_idtx_add_c(input, dest, stride, 16, tx_type); break;
+#endif // CONFIG_EXT_TX
default: assert(0); break;
}
}
@@ -221,115 +1199,758 @@
int eob, TX_TYPE tx_type) {
switch (tx_type) {
case DCT_DCT: av1_idct32x32_add(input, dest, stride, eob); break;
+#if CONFIG_EXT_TX
case ADST_DCT:
case DCT_ADST:
- case ADST_ADST: assert(0); break;
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ av1_iht32x32_1024_add_c(input, dest, stride, tx_type);
+ break;
+ case IDTX: inv_idtx_add_c(input, dest, stride, 32, tx_type); break;
+#endif // CONFIG_EXT_TX
default: assert(0); break;
}
}
+#if CONFIG_TX64X64
+void av1_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, TX_TYPE tx_type) {
+ switch (tx_type) {
+ case DCT_DCT: av1_idct64x64_add(input, dest, stride, eob); break;
+#if CONFIG_EXT_TX
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ av1_iht64x64_4096_add_c(input, dest, stride, tx_type);
+ break;
+ case IDTX: inv_idtx_add_c(input, dest, stride, 64, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+#endif // CONFIG_TX64X64
+
#if CONFIG_AOM_HIGHBITDEPTH
void av1_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int tx_type, int bd) {
- const highbd_transform_2d IHT_4[] = {
- { aom_highbd_idct4_c, aom_highbd_idct4_c }, // DCT_DCT = 0
- { aom_highbd_iadst4_c, aom_highbd_idct4_c }, // ADST_DCT = 1
- { aom_highbd_idct4_c, aom_highbd_iadst4_c }, // DCT_ADST = 2
- { aom_highbd_iadst4_c, aom_highbd_iadst4_c } // ADST_ADST = 3
+ static const highbd_transform_2d HIGH_IHT_4[] = {
+ { aom_highbd_idct4_c, aom_highbd_idct4_c }, // DCT_DCT
+ { aom_highbd_iadst4_c, aom_highbd_idct4_c }, // ADST_DCT
+ { aom_highbd_idct4_c, aom_highbd_iadst4_c }, // DCT_ADST
+ { aom_highbd_iadst4_c, aom_highbd_iadst4_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_highbd_iadst4_c, aom_highbd_idct4_c }, // FLIPADST_DCT
+ { aom_highbd_idct4_c, aom_highbd_iadst4_c }, // DCT_FLIPADST
+ { aom_highbd_iadst4_c, aom_highbd_iadst4_c }, // FLIPADST_FLIPADST
+ { aom_highbd_iadst4_c, aom_highbd_iadst4_c }, // ADST_FLIPADST
+ { aom_highbd_iadst4_c, aom_highbd_iadst4_c }, // FLIPADST_ADST
+ { highbd_iidtx4_c, highbd_iidtx4_c }, // IDTX
+ { aom_highbd_idct4_c, highbd_iidtx4_c }, // V_DCT
+ { highbd_iidtx4_c, aom_highbd_idct4_c }, // H_DCT
+ { aom_highbd_iadst4_c, highbd_iidtx4_c }, // V_ADST
+ { highbd_iidtx4_c, aom_highbd_iadst4_c }, // H_ADST
+ { aom_highbd_iadst4_c, highbd_iidtx4_c }, // V_FLIPADST
+ { highbd_iidtx4_c, aom_highbd_iadst4_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
};
+
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
int i, j;
- tran_low_t out[4 * 4];
- tran_low_t *outptr = out;
- tran_low_t temp_in[4], temp_out[4];
+ tran_low_t tmp;
+ tran_low_t out[4][4];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 4;
- // Inverse transform row vectors.
+ // inverse transform row vectors
for (i = 0; i < 4; ++i) {
- IHT_4[tx_type].rows(input, outptr, bd);
+ HIGH_IHT_4[tx_type].rows(input, out[i], bd);
input += 4;
- outptr += 4;
}
- // Inverse transform column vectors.
+ // transpose
+ for (i = 1; i < 4; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
- IHT_4[tx_type].cols(temp_in, temp_out, bd);
+ HIGH_IHT_4[tx_type].cols(out[i], out[i], bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 4, 4);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4), bd);
}
}
}
-static const highbd_transform_2d HIGH_IHT_8[] = {
- { aom_highbd_idct8_c, aom_highbd_idct8_c }, // DCT_DCT = 0
- { aom_highbd_iadst8_c, aom_highbd_idct8_c }, // ADST_DCT = 1
- { aom_highbd_idct8_c, aom_highbd_iadst8_c }, // DCT_ADST = 2
- { aom_highbd_iadst8_c, aom_highbd_iadst8_c } // ADST_ADST = 3
-};
+void av1_highbd_iht4x8_32_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_4x8[] = {
+ { aom_highbd_idct8_c, aom_highbd_idct4_c }, // DCT_DCT
+ { aom_highbd_iadst8_c, aom_highbd_idct4_c }, // ADST_DCT
+ { aom_highbd_idct8_c, aom_highbd_iadst4_c }, // DCT_ADST
+ { aom_highbd_iadst8_c, aom_highbd_iadst4_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_highbd_iadst8_c, aom_highbd_idct4_c }, // FLIPADST_DCT
+ { aom_highbd_idct8_c, aom_highbd_iadst4_c }, // DCT_FLIPADST
+ { aom_highbd_iadst8_c, aom_highbd_iadst4_c }, // FLIPADST_FLIPADST
+ { aom_highbd_iadst8_c, aom_highbd_iadst4_c }, // ADST_FLIPADST
+ { aom_highbd_iadst8_c, aom_highbd_iadst4_c }, // FLIPADST_ADST
+ { highbd_iidtx8_c, highbd_iidtx4_c }, // IDTX
+ { aom_highbd_idct8_c, highbd_iidtx4_c }, // V_DCT
+ { highbd_iidtx8_c, aom_highbd_idct4_c }, // H_DCT
+ { aom_highbd_iadst8_c, highbd_iidtx4_c }, // V_ADST
+ { highbd_iidtx8_c, aom_highbd_iadst4_c }, // H_ADST
+ { aom_highbd_iadst8_c, highbd_iidtx4_c }, // V_FLIPADST
+ { highbd_iidtx8_c, aom_highbd_iadst4_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const int n = 4;
+ const int n2 = 8;
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[4][8], outtmp[4];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n2;
+
+ // inverse transform row vectors, and transpose
+ for (i = 0; i < n2; ++i) {
+ HIGH_IHT_4x8[tx_type].rows(input, outtmp, bd);
+ for (j = 0; j < n; ++j) {
+ out[j][i] =
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(outtmp[j] * Sqrt2), bd);
+ }
+ input += n;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n; ++i) {
+ HIGH_IHT_4x8[tx_type].cols(out[i], out[i], bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5), bd);
+ }
+ }
+}
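
The rectangular kernels fold the transpose into the row pass (each row result is written to out[j][i]) and additionally rescale every row output by sqrt(2) through Sqrt2 and highbd_dct_const_round_shift, so the 2-D gain of a non-square block lines up with the square sizes. The constant and the shift width live outside this hunk; the sketch below shows the usual fixed-point form under the assumption of a Q12 constant and a round-to-nearest shift:

    #include <stdint.h>

    /* Assumed Q12 fixed-point sqrt(2); the real Sqrt2 constant and
       highbd_dct_const_round_shift() are defined elsewhere in the tree. */
    #define SQRT2_Q12 5793 /* round(sqrt(2) * 4096), assumption */

    static int32_t rescale_by_sqrt2(int32_t v) {
      return (int32_t)(((int64_t)v * SQRT2_Q12 + (1 << 11)) >> 12);
    }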
+
+void av1_highbd_iht8x4_32_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_8x4[] = {
+ { aom_highbd_idct4_c, aom_highbd_idct8_c }, // DCT_DCT
+ { aom_highbd_iadst4_c, aom_highbd_idct8_c }, // ADST_DCT
+ { aom_highbd_idct4_c, aom_highbd_iadst8_c }, // DCT_ADST
+ { aom_highbd_iadst4_c, aom_highbd_iadst8_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_highbd_iadst4_c, aom_highbd_idct8_c }, // FLIPADST_DCT
+ { aom_highbd_idct4_c, aom_highbd_iadst8_c }, // DCT_FLIPADST
+ { aom_highbd_iadst4_c, aom_highbd_iadst8_c }, // FLIPADST_FLIPADST
+ { aom_highbd_iadst4_c, aom_highbd_iadst8_c }, // ADST_FLIPADST
+ { aom_highbd_iadst4_c, aom_highbd_iadst8_c }, // FLIPADST_ADST
+ { highbd_iidtx4_c, highbd_iidtx8_c }, // IDTX
+ { aom_highbd_idct4_c, highbd_iidtx8_c }, // V_DCT
+ { highbd_iidtx4_c, aom_highbd_idct8_c }, // H_DCT
+ { aom_highbd_iadst4_c, highbd_iidtx8_c }, // V_ADST
+ { highbd_iidtx4_c, aom_highbd_iadst8_c }, // H_ADST
+ { aom_highbd_iadst4_c, highbd_iidtx8_c }, // V_FLIPADST
+ { highbd_iidtx4_c, aom_highbd_iadst8_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const int n = 4;
+ const int n2 = 8;
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[8][4], outtmp[8];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n;
+
+ // inverse transform row vectors, and transpose
+ for (i = 0; i < n; ++i) {
+ HIGH_IHT_8x4[tx_type].rows(input, outtmp, bd);
+ for (j = 0; j < n2; ++j) {
+ out[j][i] =
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(outtmp[j] * Sqrt2), bd);
+ }
+ input += n2;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n2; ++i) {
+ HIGH_IHT_8x4[tx_type].cols(out[i], out[i], bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5), bd);
+ }
+ }
+}
+
+void av1_highbd_iht8x16_128_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_8x16[] = {
+ { aom_highbd_idct16_c, aom_highbd_idct8_c }, // DCT_DCT
+ { aom_highbd_iadst16_c, aom_highbd_idct8_c }, // ADST_DCT
+ { aom_highbd_idct16_c, aom_highbd_iadst8_c }, // DCT_ADST
+ { aom_highbd_iadst16_c, aom_highbd_iadst8_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_highbd_iadst16_c, aom_highbd_idct8_c }, // FLIPADST_DCT
+ { aom_highbd_idct16_c, aom_highbd_iadst8_c }, // DCT_FLIPADST
+ { aom_highbd_iadst16_c, aom_highbd_iadst8_c }, // FLIPADST_FLIPADST
+ { aom_highbd_iadst16_c, aom_highbd_iadst8_c }, // ADST_FLIPADST
+ { aom_highbd_iadst16_c, aom_highbd_iadst8_c }, // FLIPADST_ADST
+ { highbd_iidtx16_c, highbd_iidtx8_c }, // IDTX
+ { aom_highbd_idct16_c, highbd_iidtx8_c }, // V_DCT
+ { highbd_iidtx16_c, aom_highbd_idct8_c }, // H_DCT
+ { aom_highbd_iadst16_c, highbd_iidtx8_c }, // V_ADST
+ { highbd_iidtx16_c, aom_highbd_iadst8_c }, // H_ADST
+ { aom_highbd_iadst16_c, highbd_iidtx8_c }, // V_FLIPADST
+ { highbd_iidtx16_c, aom_highbd_iadst8_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const int n = 8;
+ const int n2 = 16;
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[8][16], outtmp[8];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n2;
+
+ // inverse transform row vectors, and transpose
+ for (i = 0; i < n2; ++i) {
+ HIGH_IHT_8x16[tx_type].rows(input, outtmp, bd);
+ for (j = 0; j < n; ++j)
+ out[j][i] =
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(outtmp[j] * Sqrt2), bd);
+ input += n;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n; ++i) {
+ HIGH_IHT_8x16[tx_type].cols(out[i], out[i], bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6), bd);
+ }
+ }
+}
+
+void av1_highbd_iht16x8_128_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_16x8[] = {
+ { aom_highbd_idct8_c, aom_highbd_idct16_c }, // DCT_DCT
+ { aom_highbd_iadst8_c, aom_highbd_idct16_c }, // ADST_DCT
+ { aom_highbd_idct8_c, aom_highbd_iadst16_c }, // DCT_ADST
+ { aom_highbd_iadst8_c, aom_highbd_iadst16_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_highbd_iadst8_c, aom_highbd_idct16_c }, // FLIPADST_DCT
+ { aom_highbd_idct8_c, aom_highbd_iadst16_c }, // DCT_FLIPADST
+ { aom_highbd_iadst8_c, aom_highbd_iadst16_c }, // FLIPADST_FLIPADST
+ { aom_highbd_iadst8_c, aom_highbd_iadst16_c }, // ADST_FLIPADST
+ { aom_highbd_iadst8_c, aom_highbd_iadst16_c }, // FLIPADST_ADST
+ { highbd_iidtx8_c, highbd_iidtx16_c }, // IDTX
+ { aom_highbd_idct8_c, highbd_iidtx16_c }, // V_DCT
+ { highbd_iidtx8_c, aom_highbd_idct16_c }, // H_DCT
+ { aom_highbd_iadst8_c, highbd_iidtx16_c }, // V_ADST
+ { highbd_iidtx8_c, aom_highbd_iadst16_c }, // H_ADST
+ { aom_highbd_iadst8_c, highbd_iidtx16_c }, // V_FLIPADST
+ { highbd_iidtx8_c, aom_highbd_iadst16_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const int n = 8;
+ const int n2 = 16;
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[16][8], outtmp[16];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n;
+
+ // inverse transform row vectors, and transpose
+ for (i = 0; i < n; ++i) {
+ HIGH_IHT_16x8[tx_type].rows(input, outtmp, bd);
+ for (j = 0; j < n2; ++j)
+ out[j][i] =
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(outtmp[j] * Sqrt2), bd);
+ input += n2;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n2; ++i) {
+ HIGH_IHT_16x8[tx_type].cols(out[i], out[i], bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6), bd);
+ }
+ }
+}
+
+void av1_highbd_iht16x32_512_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_16x32[] = {
+ { aom_highbd_idct32_c, aom_highbd_idct16_c }, // DCT_DCT
+ { highbd_ihalfright32_c, aom_highbd_idct16_c }, // ADST_DCT
+ { aom_highbd_idct32_c, aom_highbd_iadst16_c }, // DCT_ADST
+ { highbd_ihalfright32_c, aom_highbd_iadst16_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { highbd_ihalfright32_c, aom_highbd_idct16_c }, // FLIPADST_DCT
+ { aom_highbd_idct32_c, aom_highbd_iadst16_c }, // DCT_FLIPADST
+ { highbd_ihalfright32_c, aom_highbd_iadst16_c }, // FLIPADST_FLIPADST
+ { highbd_ihalfright32_c, aom_highbd_iadst16_c }, // ADST_FLIPADST
+ { highbd_ihalfright32_c, aom_highbd_iadst16_c }, // FLIPADST_ADST
+ { highbd_iidtx32_c, highbd_iidtx16_c }, // IDTX
+ { aom_highbd_idct32_c, highbd_iidtx16_c }, // V_DCT
+ { highbd_iidtx32_c, aom_highbd_idct16_c }, // H_DCT
+ { highbd_ihalfright32_c, highbd_iidtx16_c }, // V_ADST
+ { highbd_iidtx32_c, aom_highbd_iadst16_c }, // H_ADST
+ { highbd_ihalfright32_c, highbd_iidtx16_c }, // V_FLIPADST
+ { highbd_iidtx32_c, aom_highbd_iadst16_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const int n = 16;
+ const int n2 = 32;
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[16][32], outtmp[16];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n2;
+
+ // inverse transform row vectors, and transpose
+ for (i = 0; i < n2; ++i) {
+ HIGH_IHT_16x32[tx_type].rows(input, outtmp, bd);
+ for (j = 0; j < n; ++j)
+ out[j][i] =
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(outtmp[j] * Sqrt2), bd);
+ input += n;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n; ++i) {
+ HIGH_IHT_16x32[tx_type].cols(out[i], out[i], bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n2, n);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6), bd);
+ }
+ }
+}
+
+void av1_highbd_iht32x16_512_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_32x16[] = {
+ { aom_highbd_idct16_c, aom_highbd_idct32_c }, // DCT_DCT
+ { aom_highbd_iadst16_c, aom_highbd_idct32_c }, // ADST_DCT
+ { aom_highbd_idct16_c, highbd_ihalfright32_c }, // DCT_ADST
+ { aom_highbd_iadst16_c, highbd_ihalfright32_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_highbd_iadst16_c, aom_highbd_idct32_c }, // FLIPADST_DCT
+ { aom_highbd_idct16_c, highbd_ihalfright32_c }, // DCT_FLIPADST
+ { aom_highbd_iadst16_c, highbd_ihalfright32_c }, // FLIPADST_FLIPADST
+ { aom_highbd_iadst16_c, highbd_ihalfright32_c }, // ADST_FLIPADST
+ { aom_highbd_iadst16_c, highbd_ihalfright32_c }, // FLIPADST_ADST
+ { highbd_iidtx16_c, highbd_iidtx32_c }, // IDTX
+ { aom_highbd_idct16_c, highbd_iidtx32_c }, // V_DCT
+ { highbd_iidtx16_c, aom_highbd_idct32_c }, // H_DCT
+ { aom_highbd_iadst16_c, highbd_iidtx32_c }, // V_ADST
+ { highbd_iidtx16_c, highbd_ihalfright32_c }, // H_ADST
+ { aom_highbd_iadst16_c, highbd_iidtx32_c }, // V_FLIPADST
+ { highbd_iidtx16_c, highbd_ihalfright32_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const int n = 16;
+ const int n2 = 32;
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[32][16], outtmp[32];
+ tran_low_t *outp = &out[0][0];
+ int outstride = n;
+
+ // inverse transform row vectors, and transpose
+ for (i = 0; i < n; ++i) {
+ HIGH_IHT_32x16[tx_type].rows(input, outtmp, bd);
+ for (j = 0; j < n2; ++j)
+ out[j][i] =
+ HIGHBD_WRAPLOW(highbd_dct_const_round_shift(outtmp[j] * Sqrt2), bd);
+ input += n2;
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < n2; ++i) {
+ HIGH_IHT_32x16[tx_type].cols(out[i], out[i], bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, n, n2);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6), bd);
+ }
+ }
+}
void av1_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int tx_type, int bd) {
- int i, j;
- tran_low_t out[8 * 8];
- tran_low_t *outptr = out;
- tran_low_t temp_in[8], temp_out[8];
- const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
+ static const highbd_transform_2d HIGH_IHT_8[] = {
+ { aom_highbd_idct8_c, aom_highbd_idct8_c }, // DCT_DCT
+ { aom_highbd_iadst8_c, aom_highbd_idct8_c }, // ADST_DCT
+ { aom_highbd_idct8_c, aom_highbd_iadst8_c }, // DCT_ADST
+ { aom_highbd_iadst8_c, aom_highbd_iadst8_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_highbd_iadst8_c, aom_highbd_idct8_c }, // FLIPADST_DCT
+ { aom_highbd_idct8_c, aom_highbd_iadst8_c }, // DCT_FLIPADST
+ { aom_highbd_iadst8_c, aom_highbd_iadst8_c }, // FLIPADST_FLIPADST
+ { aom_highbd_iadst8_c, aom_highbd_iadst8_c }, // ADST_FLIPADST
+ { aom_highbd_iadst8_c, aom_highbd_iadst8_c }, // FLIPADST_ADST
+ { highbd_iidtx8_c, highbd_iidtx8_c }, // IDTX
+ { aom_highbd_idct8_c, highbd_iidtx8_c }, // V_DCT
+ { highbd_iidtx8_c, aom_highbd_idct8_c }, // H_DCT
+ { aom_highbd_iadst8_c, highbd_iidtx8_c }, // V_ADST
+ { highbd_iidtx8_c, aom_highbd_iadst8_c }, // H_ADST
+ { aom_highbd_iadst8_c, highbd_iidtx8_c }, // V_FLIPADST
+ { highbd_iidtx8_c, aom_highbd_iadst8_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- // Inverse transform row vectors.
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[8][8];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 8;
+
+ // inverse transform row vectors
for (i = 0; i < 8; ++i) {
- ht.rows(input, outptr, bd);
+ HIGH_IHT_8[tx_type].rows(input, out[i], bd);
input += 8;
- outptr += 8;
}
- // Inverse transform column vectors.
+ // transpose
+ for (i = 1; i < 8; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- ht.cols(temp_in, temp_out, bd);
+ HIGH_IHT_8[tx_type].cols(out[i], out[i], bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 8, 8);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5), bd);
}
}
}
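
maybe_flip_strides16() is what turns the plain ADST row/column kernels in these tables into the FLIPADST variants: rather than mirroring the coefficients, it mirrors the view of the destination (and of the transform output) by adjusting pointers and strides before the add loop. Its body is not part of this hunk, so the following is only a sketch of the standard pointer/stride trick for a vertical flip:

    #include <stdint.h>

    /* Sketch: flip a row-major uint16_t view upside down by pointing at the
       last row and negating the stride.  The real maybe_flip_strides16()
       chooses per tx_type whether to flip vertically, horizontally, or not
       at all. */
    static void flip_ud(uint16_t **buf, int *stride, int rows) {
      *buf += (rows - 1) * (*stride);
      *stride = -(*stride);
    }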
-static const highbd_transform_2d HIGH_IHT_16[] = {
- { aom_highbd_idct16_c, aom_highbd_idct16_c }, // DCT_DCT = 0
- { aom_highbd_iadst16_c, aom_highbd_idct16_c }, // ADST_DCT = 1
- { aom_highbd_idct16_c, aom_highbd_iadst16_c }, // DCT_ADST = 2
- { aom_highbd_iadst16_c, aom_highbd_iadst16_c } // ADST_ADST = 3
-};
-
void av1_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int tx_type, int bd) {
- int i, j;
- tran_low_t out[16 * 16];
- tran_low_t *outptr = out;
- tran_low_t temp_in[16], temp_out[16];
- const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
+ static const highbd_transform_2d HIGH_IHT_16[] = {
+ { aom_highbd_idct16_c, aom_highbd_idct16_c }, // DCT_DCT
+ { aom_highbd_iadst16_c, aom_highbd_idct16_c }, // ADST_DCT
+ { aom_highbd_idct16_c, aom_highbd_iadst16_c }, // DCT_ADST
+ { aom_highbd_iadst16_c, aom_highbd_iadst16_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { aom_highbd_iadst16_c, aom_highbd_idct16_c }, // FLIPADST_DCT
+ { aom_highbd_idct16_c, aom_highbd_iadst16_c }, // DCT_FLIPADST
+ { aom_highbd_iadst16_c, aom_highbd_iadst16_c }, // FLIPADST_FLIPADST
+ { aom_highbd_iadst16_c, aom_highbd_iadst16_c }, // ADST_FLIPADST
+ { aom_highbd_iadst16_c, aom_highbd_iadst16_c }, // FLIPADST_ADST
+ { highbd_iidtx16_c, highbd_iidtx16_c }, // IDTX
+ { aom_highbd_idct16_c, highbd_iidtx16_c }, // V_DCT
+ { highbd_iidtx16_c, aom_highbd_idct16_c }, // H_DCT
+ { aom_highbd_iadst16_c, highbd_iidtx16_c }, // V_ADST
+ { highbd_iidtx16_c, aom_highbd_iadst16_c }, // H_ADST
+ { aom_highbd_iadst16_c, highbd_iidtx16_c }, // V_FLIPADST
+ { highbd_iidtx16_c, aom_highbd_iadst16_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- // Rows
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[16][16];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 16;
+
+ // inverse transform row vectors
for (i = 0; i < 16; ++i) {
- ht.rows(input, outptr, bd);
+ HIGH_IHT_16[tx_type].rows(input, out[i], bd);
input += 16;
- outptr += 16;
}
- // Columns
+ // transpose
+ for (i = 1; i < 16; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- ht.cols(temp_in, temp_out, bd);
+ HIGH_IHT_16[tx_type].cols(out[i], out[i], bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 16, 16);
+#endif
+
+ // Sum with the destination
+ for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6), bd);
}
}
}
+#if CONFIG_EXT_TX
+void av1_highbd_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_32[] = {
+ { aom_highbd_idct32_c, aom_highbd_idct32_c }, // DCT_DCT
+ { highbd_ihalfright32_c, aom_highbd_idct32_c }, // ADST_DCT
+ { aom_highbd_idct32_c, highbd_ihalfright32_c }, // DCT_ADST
+ { highbd_ihalfright32_c, highbd_ihalfright32_c }, // ADST_ADST
+ { highbd_ihalfright32_c, aom_highbd_idct32_c }, // FLIPADST_DCT
+ { aom_highbd_idct32_c, highbd_ihalfright32_c }, // DCT_FLIPADST
+ { highbd_ihalfright32_c, highbd_ihalfright32_c }, // FLIPADST_FLIPADST
+ { highbd_ihalfright32_c, highbd_ihalfright32_c }, // ADST_FLIPADST
+ { highbd_ihalfright32_c, highbd_ihalfright32_c }, // FLIPADST_ADST
+ { highbd_iidtx32_c, highbd_iidtx32_c }, // IDTX
+ { aom_highbd_idct32_c, highbd_iidtx32_c }, // V_DCT
+ { highbd_iidtx32_c, aom_highbd_idct32_c }, // H_DCT
+ { highbd_ihalfright32_c, highbd_iidtx32_c }, // V_ADST
+ { highbd_iidtx32_c, highbd_ihalfright32_c }, // H_ADST
+ { highbd_ihalfright32_c, highbd_iidtx32_c }, // V_FLIPADST
+ { highbd_iidtx32_c, highbd_ihalfright32_c }, // H_FLIPADST
+ };
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[32][32];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 32;
+
+ // inverse transform row vectors
+ for (i = 0; i < 32; ++i) {
+ HIGH_IHT_32[tx_type].rows(input, out[i], bd);
+ input += 32;
+ }
+
+ // transpose
+ for (i = 1; i < 32; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 32; ++i) {
+ HIGH_IHT_32[tx_type].cols(out[i], out[i], bd);
+ }
+
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 32, 32);
+
+ // Sum with the destination
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6), bd);
+ }
+ }
+}
+#endif // CONFIG_EXT_TX
+
+#if CONFIG_TX64X64
+void av1_highbd_iht64x64_4096_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ static const highbd_transform_2d HIGH_IHT_64[] = {
+ { highbd_idct64_col_c, highbd_idct64_row_c }, // DCT_DCT
+ { highbd_ihalfright64_c, highbd_idct64_row_c }, // ADST_DCT
+ { highbd_idct64_col_c, highbd_ihalfright64_c }, // DCT_ADST
+ { highbd_ihalfright64_c, highbd_ihalfright64_c }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { highbd_ihalfright64_c, highbd_idct64_row_c }, // FLIPADST_DCT
+ { highbd_idct64_col_c, highbd_ihalfright64_c }, // DCT_FLIPADST
+ { highbd_ihalfright64_c, highbd_ihalfright64_c }, // FLIPADST_FLIPADST
+ { highbd_ihalfright64_c, highbd_ihalfright64_c }, // ADST_FLIPADST
+ { highbd_ihalfright64_c, highbd_ihalfright64_c }, // FLIPADST_ADST
+ { highbd_iidtx64_c, highbd_iidtx64_c }, // IDTX
+ { highbd_idct64_col_c, highbd_iidtx64_c }, // V_DCT
+ { highbd_iidtx64_c, highbd_idct64_row_c }, // H_DCT
+ { highbd_ihalfright64_c, highbd_iidtx64_c }, // V_ADST
+ { highbd_iidtx64_c, highbd_ihalfright64_c }, // H_ADST
+ { highbd_ihalfright64_c, highbd_iidtx64_c }, // V_FLIPADST
+ { highbd_iidtx64_c, highbd_ihalfright64_c }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t tmp;
+ tran_low_t out[64][64];
+ tran_low_t *outp = &out[0][0];
+ int outstride = 64;
+
+ // inverse transform row vectors
+ for (i = 0; i < 64; ++i) {
+ HIGH_IHT_64[tx_type].rows(input, out[i], bd);
+ for (j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
+ input += 64;
+ }
+
+ // transpose
+ for (i = 1; i < 64; i++) {
+ for (j = 0; j < i; j++) {
+ tmp = out[i][j];
+ out[i][j] = out[j][i];
+ out[j][i] = tmp;
+ }
+ }
+
+ // inverse transform column vectors
+ for (i = 0; i < 64; ++i) {
+ HIGH_IHT_64[tx_type].cols(out[i], out[i], bd);
+ }
+
+#if CONFIG_EXT_TX
+ maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 64, 64);
+#endif // CONFIG_EXT_TX
+
+ // Sum with the destination
+ for (i = 0; i < 64; ++i) {
+ for (j = 0; j < 64; ++j) {
+ int d = i * stride + j;
+ int s = j * outstride + i;
+ dest[d] =
+ highbd_clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5), bd);
+ }
+ }
+}
+#endif // CONFIG_TX64X64
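
Note the extra ROUND_POWER_OF_TWO(out[i][j], 1) after the 64-point row pass: the row outputs are pre-shifted by one bit to stay within tran_low_t range, and the final add then shifts by 5 instead of the 6 used by the 32x32 path above, so the total downscale stays at 6 bits. A small self-contained check of that split (the sample value is arbitrary):

    #include <assert.h>
    #include <stdio.h>

    static int round_po2(int v, int n) { return (v + (1 << (n - 1))) >> n; }

    int main(void) {
      const int v = 4096;                               /* arbitrary coefficient */
      const int split = round_po2(round_po2(v, 1), 5);  /* 64x64 path: 1 + 5 bits */
      const int single = round_po2(v, 6);               /* 32x32 path: 6 bits     */
      printf("split=%d single=%d\n", split, single);
      /* Rounding twice can move the result by at most one unit. */
      assert(split - single <= 1 && single - split <= 1);
      return 0;
    }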
+
// idct
void av1_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob, int bd) {
@@ -398,28 +2019,116 @@
if (lossless) {
assert(tx_type == DCT_DCT);
av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
- } else {
- switch (tx_type) {
- case DCT_DCT: av1_highbd_idct4x4_add(input, dest, stride, eob, bd); break;
- case ADST_DCT:
- case DCT_ADST:
- case ADST_ADST:
- av1_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
- break;
- default: assert(0); break;
- }
+ return;
}
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ av1_inv_txfm2d_add_4x4(input, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ av1_inv_txfm2d_add_4x4(input, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
+ break;
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ // Use C version since DST only exists in C code
+ av1_highbd_iht4x4_16_add_c(input, dest, stride, tx_type, bd);
+ break;
+ case IDTX:
+ highbd_inv_idtx_add_c(input, dest, stride, 4, tx_type, bd);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd, TX_TYPE tx_type) {
+ (void)eob;
+ av1_highbd_iht4x8_32_add_c(input, dest, stride, tx_type, bd);
+}
+
+void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd, TX_TYPE tx_type) {
+ (void)eob;
+ av1_highbd_iht8x4_32_add_c(input, dest, stride, tx_type, bd);
+}
+
+void av1_highbd_inv_txfm_add_8x16(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type) {
+ (void)eob;
+ av1_highbd_iht8x16_128_add_c(input, dest, stride, tx_type, bd);
+}
+
+void av1_highbd_inv_txfm_add_16x8(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type) {
+ (void)eob;
+ av1_highbd_iht16x8_128_add_c(input, dest, stride, tx_type, bd);
+}
+
+void av1_highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type) {
+ (void)eob;
+ av1_highbd_iht16x32_512_add_c(input, dest, stride, tx_type, bd);
+}
+
+void av1_highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type) {
+ (void)eob;
+ av1_highbd_iht32x16_512_add_c(input, dest, stride, tx_type, bd);
}
void av1_highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd, TX_TYPE tx_type) {
+ (void)eob;
switch (tx_type) {
- case DCT_DCT: av1_highbd_idct8x8_add(input, dest, stride, eob, bd); break;
+ case DCT_DCT:
case ADST_DCT:
case DCT_ADST:
case ADST_ADST:
- av1_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
+ av1_inv_txfm2d_add_8x8(input, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ av1_inv_txfm2d_add_8x8(input, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
+ break;
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ // Use C version since DST only exists in C code
+ av1_highbd_iht8x8_64_add_c(input, dest, stride, tx_type, bd);
+ break;
+ case IDTX:
+ highbd_inv_idtx_add_c(input, dest, stride, 8, tx_type, bd);
+ break;
+#endif // CONFIG_EXT_TX
default: assert(0); break;
}
}
@@ -427,13 +2136,37 @@
void av1_highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd,
TX_TYPE tx_type) {
+ (void)eob;
switch (tx_type) {
- case DCT_DCT: av1_highbd_idct16x16_add(input, dest, stride, eob, bd); break;
+ case DCT_DCT:
case ADST_DCT:
case DCT_ADST:
case ADST_ADST:
- av1_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
+ av1_inv_txfm2d_add_16x16(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ av1_inv_txfm2d_add_16x16(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ // Use C version since DST only exists in C code
+ av1_highbd_iht16x16_256_add_c(input, dest, stride, tx_type, bd);
+ break;
+ case IDTX:
+ highbd_inv_idtx_add_c(input, dest, stride, 16, tx_type, bd);
+ break;
+#endif // CONFIG_EXT_TX
default: assert(0); break;
}
}
@@ -441,12 +2174,168 @@
void av1_highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd,
TX_TYPE tx_type) {
+ (void)eob;
switch (tx_type) {
- case DCT_DCT: av1_highbd_idct32x32_add(input, dest, stride, eob, bd); break;
+ case DCT_DCT:
+ av1_inv_txfm2d_add_32x32(input, CONVERT_TO_SHORTPTR(dest), stride,
+ DCT_DCT, bd);
+ break;
+#if CONFIG_EXT_TX
case ADST_DCT:
case DCT_ADST:
- case ADST_ADST: assert(0); break;
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ av1_highbd_iht32x32_1024_add_c(input, dest, stride, tx_type, bd);
+ break;
+ case IDTX:
+ highbd_inv_idtx_add_c(input, dest, stride, 32, tx_type, bd);
+ break;
+#endif // CONFIG_EXT_TX
default: assert(0); break;
}
}
+
+#if CONFIG_TX64X64
+void av1_highbd_inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd,
+ TX_TYPE tx_type) {
+ (void)eob;
+ switch (tx_type) {
+ case DCT_DCT:
+ av1_inv_txfm2d_add_64x64(input, CONVERT_TO_SHORTPTR(dest), stride,
+ DCT_DCT, bd);
+ break;
+#if CONFIG_EXT_TX
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ av1_highbd_iht64x64_4096_add_c(input, dest, stride, tx_type, bd);
+ break;
+ case IDTX:
+ highbd_inv_idtx_add_c(input, dest, stride, 64, tx_type, bd);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+void inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
+ INV_TXFM_PARAM *inv_txfm_param) {
+ const TX_TYPE tx_type = inv_txfm_param->tx_type;
+ const TX_SIZE tx_size = inv_txfm_param->tx_size;
+ const int eob = inv_txfm_param->eob;
+ const int lossless = inv_txfm_param->lossless;
+
+ switch (tx_size) {
+#if CONFIG_TX64X64
+ case TX_64X64:
+ av1_inv_txfm_add_64x64(input, dest, stride, eob, tx_type);
+ break;
+#endif // CONFIG_TX64X64
+ case TX_32X32:
+ av1_inv_txfm_add_32x32(input, dest, stride, eob, tx_type);
+ break;
+ case TX_16X16:
+ av1_inv_txfm_add_16x16(input, dest, stride, eob, tx_type);
+ break;
+ case TX_8X8: av1_inv_txfm_add_8x8(input, dest, stride, eob, tx_type); break;
+ case TX_4X8: av1_inv_txfm_add_4x8(input, dest, stride, eob, tx_type); break;
+ case TX_8X4: av1_inv_txfm_add_8x4(input, dest, stride, eob, tx_type); break;
+ case TX_8X16:
+ av1_inv_txfm_add_8x16(input, dest, stride, eob, tx_type);
+ break;
+ case TX_16X8:
+ av1_inv_txfm_add_16x8(input, dest, stride, eob, tx_type);
+ break;
+ case TX_16X32:
+ av1_inv_txfm_add_16x32(input, dest, stride, eob, tx_type);
+ break;
+ case TX_32X16:
+ av1_inv_txfm_add_32x16(input, dest, stride, eob, tx_type);
+ break;
+ case TX_4X4:
+ // this is like av1_short_idct4x4 but has a special case around eob<=1
+ // which is significant (not just an optimization) for the lossless
+ // case.
+ av1_inv_txfm_add_4x4(input, dest, stride, eob, tx_type, lossless);
+ break;
+ default: assert(0 && "Invalid transform size"); break;
+ }
+}
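
inv_txfm_add() (and its high-bitdepth twin below) is the single dispatch point callers are expected to use, with the per-block parameters bundled into the INV_TXFM_PARAM struct added to av1/common/idct.h further down in this change. A hedged usage sketch, assuming av1/common/idct.h is included and that the caller already holds the dequantized coefficients and destination buffer; the wrapper and its parameter names are illustrative only:

    static void inverse_transform_block_sketch(const tran_low_t *dqcoeff,
                                               uint8_t *dst, int dst_stride,
                                               TX_TYPE tx_type, TX_SIZE tx_size,
                                               int eob, int lossless) {
      INV_TXFM_PARAM param;
      param.tx_type = tx_type;
      param.tx_size = tx_size;
      param.eob = eob;           /* number of coded coefficients in the block */
      param.lossless = lossless;
    #if CONFIG_AOM_HIGHBITDEPTH
      param.bd = 8;  /* real callers would pass the frame bit depth, e.g. xd->bd */
    #endif
      inv_txfm_add(dqcoeff, dst, dst_stride, &param);
    }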
+
+#if CONFIG_AOM_HIGHBITDEPTH
+void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
+ INV_TXFM_PARAM *inv_txfm_param) {
+ const TX_TYPE tx_type = inv_txfm_param->tx_type;
+ const TX_SIZE tx_size = inv_txfm_param->tx_size;
+ const int eob = inv_txfm_param->eob;
+ const int bd = inv_txfm_param->bd;
+ const int lossless = inv_txfm_param->lossless;
+
+ switch (tx_size) {
+#if CONFIG_TX64X64
+ case TX_64X64:
+ av1_highbd_inv_txfm_add_64x64(input, dest, stride, eob, bd, tx_type);
+ break;
+#endif // CONFIG_TX64X64
+ case TX_32X32:
+ av1_highbd_inv_txfm_add_32x32(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_16X16:
+ av1_highbd_inv_txfm_add_16x16(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_8X8:
+ av1_highbd_inv_txfm_add_8x8(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_4X8:
+ av1_highbd_inv_txfm_add_4x8(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_8X4:
+ av1_highbd_inv_txfm_add_8x4(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_8X16:
+ av1_highbd_inv_txfm_add_8x16(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_16X8:
+ av1_highbd_inv_txfm_add_16x8(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_16X32:
+ av1_highbd_inv_txfm_add_16x32(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_32X16:
+ av1_highbd_inv_txfm_add_32x16(input, dest, stride, eob, bd, tx_type);
+ break;
+ case TX_4X4:
+ // this is like av1_short_idct4x4 but has a special case around eob<=1
+ // which is significant (not just an optimization) for the lossless
+ // case.
+ av1_highbd_inv_txfm_add_4x4(input, dest, stride, eob, bd, tx_type,
+ lossless);
+ break;
+ default: assert(0 && "Invalid transform size"); break;
+ }
+}
#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/idct.h b/av1/common/idct.h
index a667aac..db9a6e2 100644
--- a/av1/common/idct.h
+++ b/av1/common/idct.h
@@ -15,6 +15,7 @@
#include <assert.h>
#include "./aom_config.h"
+#include "av1/common/blockd.h"
#include "av1/common/common.h"
#include "av1/common/enums.h"
#include "aom_dsp/inv_txfm.h"
@@ -25,6 +26,16 @@
extern "C" {
#endif
+typedef struct INV_TXFM_PARAM {
+ TX_TYPE tx_type;
+ TX_SIZE tx_size;
+ int eob;
+ int lossless;
+#if CONFIG_AOM_HIGHBITDEPTH
+ int bd;
+#endif
+} INV_TXFM_PARAM;
+
typedef void (*transform_1d)(const tran_low_t *, tran_low_t *);
typedef struct {
@@ -39,20 +50,35 @@
} highbd_transform_2d;
#endif // CONFIG_AOM_HIGHBITDEPTH
+#define MAX_TX_SCALE 1
+int get_tx_scale(const MACROBLOCKD *const xd, const TX_TYPE tx_type,
+ const TX_SIZE tx_size);
+
void av1_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
void av1_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
+void av1_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob);
+void av1_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob);
+void av1_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob);
void av1_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest, int stride,
int eob, TX_TYPE tx_type, int lossless);
+void av1_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, TX_TYPE tx_type);
+void av1_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, TX_TYPE tx_type);
void av1_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest, int stride,
int eob, TX_TYPE tx_type);
void av1_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest, int stride,
int eob, TX_TYPE tx_type);
void av1_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest, int stride,
int eob, TX_TYPE tx_type);
-
+void inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
+ INV_TXFM_PARAM *inv_txfm_param);
#if CONFIG_AOM_HIGHBITDEPTH
void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob, int bd);
@@ -67,6 +93,10 @@
void av1_highbd_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd, TX_TYPE tx_type,
int lossless);
+void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd, TX_TYPE tx_type);
+void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
+ int stride, int eob, int bd, TX_TYPE tx_type);
void av1_highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd, TX_TYPE tx_type);
void av1_highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
@@ -75,6 +105,8 @@
void av1_highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
int stride, int eob, int bd,
TX_TYPE tx_type);
+void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
+ INV_TXFM_PARAM *inv_txfm_param);
#endif // CONFIG_AOM_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"
diff --git a/av1/common/loopfilter.c b/av1/common/loopfilter.c
index 5d6b87e..25ce24a 100644
--- a/av1/common/loopfilter.c
+++ b/av1/common/loopfilter.c
@@ -9,6 +9,8 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <math.h>
+
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "av1/common/loopfilter.h"
@@ -66,7 +68,7 @@
 // A loopfilter should be applied to every other 4-sample row vertically.
static const uint64_t above_64x64_txform_mask[TX_SIZES] = {
#if CONFIG_CB4X4
- 0xffffffffffffffffULL, // TX_2X2
+ 0xffffffffffffffffULL, // TX_4X4
#endif
0xffffffffffffffffULL, // TX_4X4
0xffffffffffffffffULL, // TX_8x8
@@ -220,6 +222,11 @@
static const int mode_lf_lut[MB_MODE_COUNT] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES
1, 1, 0, 1 // INTER_MODES (ZEROMV == 0)
+#if CONFIG_EXT_INTER
+ ,
+ 1, // NEWFROMNEARMV mode
+ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1 // INTER_COMPOUND_MODES (ZERO_ZEROMV == 0)
+#endif // CONFIG_EXT_INTER
};
static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
@@ -245,8 +252,16 @@
static uint8_t get_filter_level(const loop_filter_info_n *lfi_n,
const MB_MODE_INFO *mbmi) {
- return lfi_n->lvl[mbmi->segment_id][mbmi->ref_frame[0]]
- [mode_lf_lut[mbmi->mode]];
+#if CONFIG_SUPERTX
+ const int segment_id = AOMMIN(mbmi->segment_id, mbmi->segment_id_supertx);
+ assert(
+ IMPLIES(supertx_enabled(mbmi), mbmi->segment_id_supertx != MAX_SEGMENTS));
+ assert(IMPLIES(supertx_enabled(mbmi),
+ mbmi->segment_id_supertx <= mbmi->segment_id));
+#else
+ const int segment_id = mbmi->segment_id;
+#endif // CONFIG_SUPERTX
+ return lfi_n->lvl[segment_id][mbmi->ref_frame[0]][mode_lf_lut[mbmi->mode]];
}
void av1_loop_filter_init(AV1_COMMON *cm) {
@@ -297,7 +312,7 @@
const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
- for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref) {
+ for (ref = LAST_FRAME; ref < TOTAL_REFS_PER_FRAME; ++ref) {
for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale +
lf->mode_deltas[mode] * scale;
@@ -705,8 +720,11 @@
const int shift_uv, LOOP_FILTER_MASK *lfm) {
const MB_MODE_INFO *mbmi = &mi->mbmi;
const BLOCK_SIZE block_size = mbmi->sb_type;
- const TX_SIZE tx_size_y = mbmi->tx_size;
- const TX_SIZE tx_size_uv = get_uv_tx_size_impl(tx_size_y, block_size, 1, 1);
+  // TODO(debargha): Check if masks can be set up correctly when
+  // rectangular transforms are used with the EXT_TX expt.
+ const TX_SIZE tx_size_y = txsize_sqr_up_map[mbmi->tx_size];
+ const TX_SIZE tx_size_uv =
+ txsize_sqr_up_map[uv_txsize_lookup[block_size][mbmi->tx_size][1][1]];
const int filter_level = get_filter_level(lfi_n, mbmi);
uint64_t *const left_y = &lfm->left_y[tx_size_y];
uint64_t *const above_y = &lfm->above_y[tx_size_y];
@@ -722,11 +740,10 @@
} else {
const int w = num_8x8_blocks_wide_lookup[block_size];
const int h = num_8x8_blocks_high_lookup[block_size];
- int index = shift_y;
- for (i = 0; i < h; i++) {
- memset(&lfm->lfl_y[index], filter_level, w);
- index += 8;
- }
+ const int row = (shift_y >> MAX_MIB_SIZE_LOG2);
+ const int col = shift_y - (row << MAX_MIB_SIZE_LOG2);
+
+ for (i = 0; i < h; i++) memset(&lfm->lfl_y[row + i][col], filter_level, w);
}
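
lfl_y is now a two-dimensional [row][col] array rather than a flat one, so the old linear shift_y offset is decomposed into its MI row (high bits) and MI column (low bits) using MAX_MIB_SIZE_LOG2, as above. The sketch below assumes the same split, with the log2 value passed explicitly since the macro is defined elsewhere:

    /* Sketch: split a linear MI offset inside a superblock into (row, col). */
    static void mi_offset_to_row_col(int shift_y, int log2_mib_size,
                                     int *row, int *col) {
      *row = shift_y >> log2_mib_size;
      *col = shift_y - (*row << log2_mib_size);
    }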
// These set 1 in the current block size for the block size edges.
@@ -748,7 +765,7 @@
// If the block has no coefficients and is not intra we skip applying
// the loop filter on block edges.
- if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi)) return;
+ if (mbmi->skip && is_inter_block(mbmi)) return;
// Here we are adding a mask for the transform size. The transform
// size mask is set to be correct for a 64x64 prediction block size. We
@@ -781,10 +798,18 @@
// we only update u and v masks on the first block.
static void build_y_mask(const loop_filter_info_n *const lfi_n,
const MODE_INFO *mi, const int shift_y,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif // CONFIG_SUPERTX
LOOP_FILTER_MASK *lfm) {
const MB_MODE_INFO *mbmi = &mi->mbmi;
+ const TX_SIZE tx_size_y = txsize_sqr_up_map[mbmi->tx_size];
+#if CONFIG_SUPERTX
+ const BLOCK_SIZE block_size =
+ supertx_enabled ? (BLOCK_SIZE)(3 * tx_size_y) : mbmi->sb_type;
+#else
const BLOCK_SIZE block_size = mbmi->sb_type;
- const TX_SIZE tx_size_y = mbmi->tx_size;
+#endif
const int filter_level = get_filter_level(lfi_n, mbmi);
uint64_t *const left_y = &lfm->left_y[tx_size_y];
uint64_t *const above_y = &lfm->above_y[tx_size_y];
@@ -796,17 +821,16 @@
} else {
const int w = num_8x8_blocks_wide_lookup[block_size];
const int h = num_8x8_blocks_high_lookup[block_size];
- int index = shift_y;
- for (i = 0; i < h; i++) {
- memset(&lfm->lfl_y[index], filter_level, w);
- index += 8;
- }
+ const int row = (shift_y >> MAX_MIB_SIZE_LOG2);
+ const int col = shift_y - (row << MAX_MIB_SIZE_LOG2);
+
+ for (i = 0; i < h; i++) memset(&lfm->lfl_y[row + i][col], filter_level, w);
}
*above_y |= above_prediction_mask[block_size] << shift_y;
*left_y |= left_prediction_mask[block_size] << shift_y;
- if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi)) return;
+ if (mbmi->skip && is_inter_block(mbmi)) return;
*above_y |= (size_mask[block_size] & above_64x64_txform_mask[tx_size_y])
<< shift_y;
@@ -849,12 +873,11 @@
const int shift_32_uv[] = { 0, 2, 8, 10 };
const int shift_16_uv[] = { 0, 1, 4, 5 };
int i;
- const int max_rows =
- (mi_row + MAX_MIB_SIZE > cm->mi_rows ? cm->mi_rows - mi_row
- : MAX_MIB_SIZE);
- const int max_cols =
- (mi_col + MAX_MIB_SIZE > cm->mi_cols ? cm->mi_cols - mi_col
- : MAX_MIB_SIZE);
+ const int max_rows = AOMMIN(cm->mi_rows - mi_row, MAX_MIB_SIZE);
+ const int max_cols = AOMMIN(cm->mi_cols - mi_col, MAX_MIB_SIZE);
+#if CONFIG_EXT_PARTITION
+ assert(0 && "Not yet updated");
+#endif // CONFIG_EXT_PARTITION
av1_zero(*lfm);
assert(mip[0] != NULL);
@@ -890,17 +913,29 @@
break;
case BLOCK_32X16:
build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
+#if CONFIG_SUPERTX
+ if (supertx_enabled(&mip[0]->mbmi)) break;
+#endif
if (mi_32_row_offset + 2 >= max_rows) continue;
mip2 = mip + mode_info_stride * 2;
build_masks(lfi_n, mip2[0], shift_y_32 + 16, shift_uv_32 + 4, lfm);
break;
case BLOCK_16X32:
build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
+#if CONFIG_SUPERTX
+ if (supertx_enabled(&mip[0]->mbmi)) break;
+#endif
if (mi_32_col_offset + 2 >= max_cols) continue;
mip2 = mip + 2;
build_masks(lfi_n, mip2[0], shift_y_32 + 2, shift_uv_32 + 1, lfm);
break;
default:
+#if CONFIG_SUPERTX
+ if (mip[0]->mbmi.tx_size == TX_32X32) {
+ build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm);
+ break;
+ }
+#endif
for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
const int shift_y_32_16 = shift_y_32 + shift_16_y[idx_16];
const int shift_uv_32_16 = shift_uv_32 + shift_16_uv[idx_16];
@@ -918,21 +953,42 @@
lfm);
break;
case BLOCK_16X8:
+#if CONFIG_SUPERTX
+ if (supertx_enabled(&mip[0]->mbmi)) break;
+#endif
build_masks(lfi_n, mip[0], shift_y_32_16, shift_uv_32_16,
lfm);
if (mi_16_row_offset + 1 >= max_rows) continue;
mip2 = mip + mode_info_stride;
- build_y_mask(lfi_n, mip2[0], shift_y_32_16 + 8, lfm);
+ build_y_mask(lfi_n, mip2[0], shift_y_32_16 + 8,
+#if CONFIG_SUPERTX
+ 0,
+#endif
+ lfm);
break;
case BLOCK_8X16:
+#if CONFIG_SUPERTX
+ if (supertx_enabled(&mip[0]->mbmi)) break;
+#endif
build_masks(lfi_n, mip[0], shift_y_32_16, shift_uv_32_16,
lfm);
if (mi_16_col_offset + 1 >= max_cols) continue;
mip2 = mip + 1;
- build_y_mask(lfi_n, mip2[0], shift_y_32_16 + 1, lfm);
+ build_y_mask(lfi_n, mip2[0], shift_y_32_16 + 1,
+#if CONFIG_SUPERTX
+ 0,
+#endif
+ lfm);
break;
default: {
const int shift_y_32_16_8_zero = shift_y_32_16 + shift_8_y[0];
+#if CONFIG_SUPERTX
+ if (mip[0]->mbmi.tx_size == TX_16X16) {
+ build_masks(lfi_n, mip[0], shift_y_32_16_8_zero,
+ shift_uv_32_16, lfm);
+ break;
+ }
+#endif
build_masks(lfi_n, mip[0], shift_y_32_16_8_zero,
shift_uv_32_16, lfm);
mip += offset[0];
@@ -947,7 +1003,11 @@
if (mi_8_col_offset >= max_cols ||
mi_8_row_offset >= max_rows)
continue;
- build_y_mask(lfi_n, mip[0], shift_y_32_16_8, lfm);
+ build_y_mask(lfi_n, mip[0], shift_y_32_16_8,
+#if CONFIG_SUPERTX
+ supertx_enabled(&mip[0]->mbmi),
+#endif
+ lfm);
}
break;
}
@@ -982,8 +1042,9 @@
const uint64_t rows = cm->mi_rows - mi_row;
// Each pixel inside the border gets a 1,
- const uint64_t mask_y = (((uint64_t)1 << (rows << 3)) - 1);
- const uint16_t mask_uv = (((uint16_t)1 << (((rows + 1) >> 1) << 2)) - 1);
+ const uint64_t mask_y = (((uint64_t)1 << (rows << MAX_MIB_SIZE_LOG2)) - 1);
+ const uint16_t mask_uv =
+ (((uint16_t)1 << (((rows + 1) >> 1) << (MAX_MIB_SIZE_LOG2 - 1))) - 1);
// Remove values completely outside our border.
for (i = 0; i < TX_32X32; i++) {
@@ -1138,7 +1199,7 @@
void av1_filter_block_plane_non420_ver(AV1_COMMON *cm,
struct macroblockd_plane *plane,
- MODE_INFO **mi_8x8, int mi_row,
+ MODE_INFO **mib, int mi_row,
int mi_col) {
const int ss_x = plane->subsampling_x;
const int ss_y = plane->subsampling_y;
@@ -1151,70 +1212,136 @@
unsigned int mask_8x8[MAX_MIB_SIZE] = { 0 };
unsigned int mask_4x4[MAX_MIB_SIZE] = { 0 };
unsigned int mask_4x4_int[MAX_MIB_SIZE] = { 0 };
- uint8_t lfl[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
+ uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE] = { { 0 } };
int r, c;
- MODE_INFO **tmp_mi = mi_8x8;
+ MODE_INFO **tmp_mi = mib;
- for (r = 0; r < MAX_MIB_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+ for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) {
unsigned int mask_16x16_c = 0;
unsigned int mask_8x8_c = 0;
unsigned int mask_4x4_c = 0;
unsigned int border_mask;
// Determine the vertical edges that need filtering
- for (c = 0; c < MAX_MIB_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
+ for (c = 0; c < cm->mib_size && mi_col + c < cm->mi_cols; c += col_step) {
const MODE_INFO *mi = tmp_mi[c];
- const BLOCK_SIZE sb_type = mi[0].mbmi.sb_type;
- const int skip_this = mi[0].mbmi.skip && is_inter_block(&mi[0].mbmi);
+ const MB_MODE_INFO *mbmi = &mi[0].mbmi;
+ const BLOCK_SIZE sb_type = mbmi->sb_type;
+ const int skip_this = mbmi->skip && is_inter_block(mbmi);
+ const int blk_row = r & (num_8x8_blocks_high_lookup[sb_type] - 1);
+ const int blk_col = c & (num_8x8_blocks_wide_lookup[sb_type] - 1);
+
// left edge of current unit is block/partition edge -> no skip
const int block_edge_left =
- (num_4x4_blocks_wide_lookup[sb_type] > 1)
- ? !(c & (num_8x8_blocks_wide_lookup[sb_type] - 1))
- : 1;
+ (num_4x4_blocks_wide_lookup[sb_type] > 1) ? !blk_col : 1;
const int skip_this_c = skip_this && !block_edge_left;
// top edge of current unit is block/partition edge -> no skip
const int block_edge_above =
- (num_4x4_blocks_high_lookup[sb_type] > 1)
- ? !(r & (num_8x8_blocks_high_lookup[sb_type] - 1))
- : 1;
+ (num_4x4_blocks_high_lookup[sb_type] > 1) ? !blk_row : 1;
const int skip_this_r = skip_this && !block_edge_above;
- const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
- ? get_uv_tx_size(&mi[0].mbmi, plane)
- : mi[0].mbmi.tx_size;
+
+#if CONFIG_VAR_TX
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ TX_SIZE mb_tx_size = is_rect_tx(mbmi->tx_size)
+ ? mbmi->tx_size
+ : mbmi->inter_tx_size[blk_row][blk_col];
+#else
+ const TX_SIZE mb_tx_size = mbmi->inter_tx_size[blk_row][blk_col];
+#endif
+#endif
+
+ TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
+ ? get_uv_tx_size(mbmi, plane)
+ : mbmi->tx_size;
+
const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
+ TX_SIZE tx_size_c = tx_size_wide_unit[tx_size];
+ TX_SIZE tx_size_r = tx_size_high_unit[tx_size];
+
+ int tx_size_mask = 0;
const int c_step = (c >> ss_x);
const int r_step = (r >> ss_y);
const int col_mask = 1 << c_step;
+#if CONFIG_VAR_TX
+ if (is_inter_block(mbmi) && !mbmi->skip)
+ tx_size = (plane->plane_type == PLANE_TYPE_UV)
+ ? uv_txsize_lookup[sb_type][mb_tx_size][ss_x][ss_y]
+ : mb_tx_size;
+#endif
+
// Filter level can vary per MI
- if (!(lfl[(r << 3) + c_step] =
- get_filter_level(&cm->lf_info, &mi[0].mbmi)))
- continue;
+ if (!(lfl[r][c_step] = get_filter_level(&cm->lf_info, mbmi))) continue;
+
+ if (txsize_sqr_up_map[tx_size] == TX_32X32)
+ tx_size_mask = 3;
+ else if (txsize_sqr_up_map[tx_size] == TX_16X16)
+ tx_size_mask = 1;
+ else
+ tx_size_mask = 0;
+
+#if CONFIG_VAR_TX
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ tx_size_r =
+ AOMMIN(txsize_horz_map[tx_size], cm->above_txfm_context[mi_col + c]);
+ tx_size_c = AOMMIN(txsize_vert_map[tx_size],
+ cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK]);
+
+ cm->above_txfm_context[mi_col + c] = txsize_horz_map[tx_size];
+ cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK] =
+ txsize_vert_map[tx_size];
+#else
+ tx_size_r = AOMMIN(tx_size, cm->above_txfm_context[mi_col + c]);
+ tx_size_c =
+ AOMMIN(tx_size, cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK]);
+
+ cm->above_txfm_context[mi_col + c] = tx_size;
+ cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK] = tx_size;
+#endif
+#endif
// Build masks based on the transform size of each block
- if (tx_size == TX_32X32) {
- if (!skip_this_c && (c_step & 3) == 0) {
+ // handle vertical mask
+ if (tx_size_c == TX_32X32) {
+ if (!skip_this_c && (c_step & tx_size_mask) == 0) {
if (!skip_border_4x4_c)
mask_16x16_c |= col_mask;
else
mask_8x8_c |= col_mask;
}
- if (!skip_this_r && (r_step & 3) == 0) {
+ } else if (tx_size_c == TX_16X16) {
+ if (!skip_this_c && (c_step & tx_size_mask) == 0) {
+ if (!skip_border_4x4_c)
+ mask_16x16_c |= col_mask;
+ else
+ mask_8x8_c |= col_mask;
+ }
+ } else {
+ // force 8x8 filtering on 32x32 boundaries
+ if (!skip_this_c && (c_step & tx_size_mask) == 0) {
+ if (tx_size_c == TX_8X8 || ((c >> ss_x) & 3) == 0)
+ mask_8x8_c |= col_mask;
+ else
+ mask_4x4_c |= col_mask;
+ }
+
+ if (!skip_this && tx_size_c < TX_8X8 && !skip_border_4x4_c &&
+ (c_step & tx_size_mask) == 0)
+ mask_4x4_int[r] |= col_mask;
+ }
+
+ // set horizontal mask
+ if (tx_size_r == TX_32X32) {
+ if (!skip_this_r && (r_step & tx_size_mask) == 0) {
if (!skip_border_4x4_r)
mask_16x16[r] |= col_mask;
else
mask_8x8[r] |= col_mask;
}
- } else if (tx_size == TX_16X16) {
- if (!skip_this_c && (c_step & 1) == 0) {
- if (!skip_border_4x4_c)
- mask_16x16_c |= col_mask;
- else
- mask_8x8_c |= col_mask;
- }
- if (!skip_this_r && (r_step & 1) == 0) {
+ } else if (tx_size_r == TX_16X16) {
+ if (!skip_this_r && (r_step & tx_size_mask) == 0) {
if (!skip_border_4x4_r)
mask_16x16[r] |= col_mask;
else
@@ -1222,21 +1349,15 @@
}
} else {
// force 8x8 filtering on 32x32 boundaries
- if (!skip_this_c) {
- if (tx_size == TX_8X8 || (c_step & 3) == 0)
- mask_8x8_c |= col_mask;
- else
- mask_4x4_c |= col_mask;
- }
-
- if (!skip_this_r) {
- if (tx_size == TX_8X8 || (r_step & 3) == 0)
+ if (!skip_this_r && (r_step & tx_size_mask) == 0) {
+ if (tx_size_r == TX_8X8 || (r_step & 3) == 0)
mask_8x8[r] |= col_mask;
else
mask_4x4[r] |= col_mask;
}
- if (!skip_this && tx_size < TX_8X8 && !skip_border_4x4_c)
+ if (!skip_this && tx_size_r < TX_8X8 && !skip_border_4x4_c &&
+ ((r >> ss_y) & tx_size_mask) == 0)
mask_4x4_int[r] |= col_mask;
}
}
@@ -1248,30 +1369,30 @@
highbd_filter_selectively_vert(
CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
mask_16x16_c & border_mask, mask_8x8_c & border_mask,
- mask_4x4_c & border_mask, mask_4x4_int[r], &cm->lf_info, &lfl[r << 3],
+ mask_4x4_c & border_mask, mask_4x4_int[r], &cm->lf_info, &lfl[r][0],
(int)cm->bit_depth);
} else {
filter_selectively_vert(dst->buf, dst->stride, mask_16x16_c & border_mask,
mask_8x8_c & border_mask,
mask_4x4_c & border_mask, mask_4x4_int[r],
- &cm->lf_info, &lfl[r << 3]);
+ &cm->lf_info, &lfl[r][0]);
}
#else
filter_selectively_vert(dst->buf, dst->stride, mask_16x16_c & border_mask,
mask_8x8_c & border_mask, mask_4x4_c & border_mask,
- mask_4x4_int[r], &cm->lf_info, &lfl[r << 3]);
+ mask_4x4_int[r], &cm->lf_info, &lfl[r][0]);
#endif // CONFIG_AOM_HIGHBITDEPTH
- dst->buf += 8 * dst->stride;
+ dst->buf += MI_SIZE * dst->stride;
tmp_mi += row_step_stride;
}
- // restore the buf pointer in case there is additional filter pass.
+ // Now do horizontal pass
dst->buf = dst0;
}
void av1_filter_block_plane_non420_hor(AV1_COMMON *cm,
struct macroblockd_plane *plane,
- MODE_INFO **mi_8x8, int mi_row,
+ MODE_INFO **mib, int mi_row,
int mi_col) {
const int ss_x = plane->subsampling_x;
const int ss_y = plane->subsampling_y;
@@ -1284,71 +1405,133 @@
unsigned int mask_8x8[MAX_MIB_SIZE] = { 0 };
unsigned int mask_4x4[MAX_MIB_SIZE] = { 0 };
unsigned int mask_4x4_int[MAX_MIB_SIZE] = { 0 };
- uint8_t lfl[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
+ uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE];
int r, c;
- MODE_INFO **tmp_mi = mi_8x8;
-
- // re-populate the filter mask for horizontal pass, it is the same as code
- // in the av1_filter_block_plane_non420_ver
- for (r = 0; r < MAX_MIB_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+ MODE_INFO **tmp_mi = mib;
+ for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) {
unsigned int mask_16x16_c = 0;
unsigned int mask_8x8_c = 0;
unsigned int mask_4x4_c = 0;
- // Determine the horizontal edges that need filtering
- for (c = 0; c < MAX_MIB_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
+    // Build the filter masks for this row (same computation as in the vertical pass)
+ for (c = 0; c < cm->mib_size && mi_col + c < cm->mi_cols; c += col_step) {
const MODE_INFO *mi = tmp_mi[c];
- const BLOCK_SIZE sb_type = mi[0].mbmi.sb_type;
- const int skip_this = mi[0].mbmi.skip && is_inter_block(&mi[0].mbmi);
+ const MB_MODE_INFO *mbmi = &mi[0].mbmi;
+ const BLOCK_SIZE sb_type = mbmi->sb_type;
+ const int skip_this = mbmi->skip && is_inter_block(mbmi);
+ const int blk_row = r & (num_8x8_blocks_high_lookup[sb_type] - 1);
+ const int blk_col = c & (num_8x8_blocks_wide_lookup[sb_type] - 1);
+
// left edge of current unit is block/partition edge -> no skip
const int block_edge_left =
- (num_4x4_blocks_wide_lookup[sb_type] > 1)
- ? !(c & (num_8x8_blocks_wide_lookup[sb_type] - 1))
- : 1;
+ (num_4x4_blocks_wide_lookup[sb_type] > 1) ? !blk_col : 1;
const int skip_this_c = skip_this && !block_edge_left;
// top edge of current unit is block/partition edge -> no skip
const int block_edge_above =
- (num_4x4_blocks_high_lookup[sb_type] > 1)
- ? !(r & (num_8x8_blocks_high_lookup[sb_type] - 1))
- : 1;
+ (num_4x4_blocks_high_lookup[sb_type] > 1) ? !blk_row : 1;
const int skip_this_r = skip_this && !block_edge_above;
- const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
- ? get_uv_tx_size(&mi[0].mbmi, plane)
- : mi[0].mbmi.tx_size;
+
+ TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
+ ? get_uv_tx_size(mbmi, plane)
+ : mbmi->tx_size;
+#if CONFIG_VAR_TX
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ TX_SIZE mb_tx_size = is_rect_tx(mbmi->tx_size)
+ ? mbmi->tx_size
+ : mbmi->inter_tx_size[blk_row][blk_col];
+#else
+ TX_SIZE mb_tx_size = mbmi->inter_tx_size[blk_row][blk_col];
+#endif
+#endif
const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
+ TX_SIZE tx_size_c = tx_size_wide_unit[tx_size];
+ TX_SIZE tx_size_r = tx_size_high_unit[tx_size];
+
+ int tx_size_mask = 0;
const int c_step = (c >> ss_x);
const int r_step = (r >> ss_y);
const int col_mask = 1 << c_step;
+#if CONFIG_VAR_TX
+ if (is_inter_block(mbmi) && !mbmi->skip) {
+ tx_size = (plane->plane_type == PLANE_TYPE_UV)
+ ? uv_txsize_lookup[sb_type][mb_tx_size][ss_x][ss_y]
+ : mb_tx_size;
+ }
+#endif
+
// Filter level can vary per MI
- if (!(lfl[(r << 3) + c_step] =
- get_filter_level(&cm->lf_info, &mi[0].mbmi)))
- continue;
+ if (!(lfl[r][c_step] = get_filter_level(&cm->lf_info, mbmi))) continue;
+
+ if (txsize_sqr_up_map[tx_size] == TX_32X32)
+ tx_size_mask = 3;
+ else if (txsize_sqr_up_map[tx_size] == TX_16X16)
+ tx_size_mask = 1;
+ else
+ tx_size_mask = 0;
+
+#if CONFIG_VAR_TX
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ tx_size_r =
+ AOMMIN(txsize_horz_map[tx_size], cm->above_txfm_context[mi_col + c]);
+ tx_size_c = AOMMIN(txsize_vert_map[tx_size],
+ cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK]);
+
+ cm->above_txfm_context[mi_col + c] = txsize_horz_map[tx_size];
+ cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK] =
+ txsize_vert_map[tx_size];
+#else
+ tx_size_r = AOMMIN(tx_size, cm->above_txfm_context[mi_col + c]);
+ tx_size_c =
+ AOMMIN(tx_size, cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK]);
+
+ cm->above_txfm_context[mi_col + c] = tx_size;
+ cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK] = tx_size;
+#endif
+#endif
// Build masks based on the transform size of each block
- if (tx_size == TX_32X32) {
- if (!skip_this_c && (c_step & 3) == 0) {
+ // handle vertical mask
+ if (tx_size_c == TX_32X32) {
+ if (!skip_this_c && (c_step & tx_size_mask) == 0) {
if (!skip_border_4x4_c)
mask_16x16_c |= col_mask;
else
mask_8x8_c |= col_mask;
}
- if (!skip_this_r && (r_step & 3) == 0) {
+ } else if (tx_size_c == TX_16X16) {
+ if (!skip_this_c && (c_step & tx_size_mask) == 0) {
+ if (!skip_border_4x4_c)
+ mask_16x16_c |= col_mask;
+ else
+ mask_8x8_c |= col_mask;
+ }
+ } else {
+ // force 8x8 filtering on 32x32 boundaries
+ if (!skip_this_c && (c_step & tx_size_mask) == 0) {
+ if (tx_size_c == TX_8X8 || ((c >> ss_x) & 3) == 0)
+ mask_8x8_c |= col_mask;
+ else
+ mask_4x4_c |= col_mask;
+ }
+
+ if (!skip_this && tx_size_c < TX_8X8 && !skip_border_4x4_c &&
+ (c_step & tx_size_mask) == 0)
+ mask_4x4_int[r] |= col_mask;
+ }
+
+ // set horizontal mask
+ if (tx_size_r == TX_32X32) {
+ if (!skip_this_r && (r_step & tx_size_mask) == 0) {
if (!skip_border_4x4_r)
mask_16x16[r] |= col_mask;
else
mask_8x8[r] |= col_mask;
}
- } else if (tx_size == TX_16X16) {
- if (!skip_this_c && (c_step & 1) == 0) {
- if (!skip_border_4x4_c)
- mask_16x16_c |= col_mask;
- else
- mask_8x8_c |= col_mask;
- }
- if (!skip_this_r && (r_step & 1) == 0) {
+ } else if (tx_size_r == TX_16X16) {
+ if (!skip_this_r && (r_step & tx_size_mask) == 0) {
if (!skip_border_4x4_r)
mask_16x16[r] |= col_mask;
else
@@ -1356,29 +1539,21 @@
}
} else {
// force 8x8 filtering on 32x32 boundaries
- if (!skip_this_c) {
- if (tx_size == TX_8X8 || (c_step & 3) == 0)
- mask_8x8_c |= col_mask;
- else
- mask_4x4_c |= col_mask;
- }
-
- if (!skip_this_r) {
- if (tx_size == TX_8X8 || (r_step & 3) == 0)
+ if (!skip_this_r && (r_step & tx_size_mask) == 0) {
+ if (tx_size_r == TX_8X8 || (r_step & 3) == 0)
mask_8x8[r] |= col_mask;
else
mask_4x4[r] |= col_mask;
}
- if (!skip_this && tx_size < TX_8X8 && !skip_border_4x4_c)
+ if (!skip_this && tx_size_r < TX_8X8 && !skip_border_4x4_c &&
+ ((r >> ss_y) & tx_size_mask) == 0)
mask_4x4_int[r] |= col_mask;
}
}
-
tmp_mi += row_step_stride;
}
-
- for (r = 0; r < MAX_MIB_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+ for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) {
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
@@ -1400,21 +1575,19 @@
highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
dst->stride, mask_16x16_r, mask_8x8_r,
mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
- &lfl[r << 3], (int)cm->bit_depth);
+ &lfl[r][0], (int)cm->bit_depth);
} else {
filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
- &lfl[r << 3]);
+ &lfl[r][0]);
}
#else
filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
- &lfl[r << 3]);
+ &lfl[r][0]);
#endif // CONFIG_AOM_HIGHBITDEPTH
- dst->buf += 8 * dst->stride;
+ dst->buf += MI_SIZE * dst->stride;
}
-
- // restore the buf pointer in case there is additional filter pass.
dst->buf = dst0;
}
@@ -1432,7 +1605,7 @@
assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
// Vertical pass: do 2 rows at one time
- for (r = 0; r < MAX_MIB_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+ for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) {
unsigned int mask_16x16_l = mask_16x16 & 0xffff;
unsigned int mask_8x8_l = mask_8x8 & 0xffff;
unsigned int mask_4x4_l = mask_4x4 & 0xffff;
@@ -1444,24 +1617,25 @@
highbd_filter_selectively_vert_row2(
plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
- &lfm->lfl_y[r << 3], (int)cm->bit_depth);
+ &lfm->lfl_y[r][0], (int)cm->bit_depth);
} else {
filter_selectively_vert_row2(
plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
- mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
+ mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r][0]);
}
#else
filter_selectively_vert_row2(
plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
- mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
+ mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r][0]);
#endif // CONFIG_AOM_HIGHBITDEPTH
- dst->buf += 16 * dst->stride;
- mask_16x16 >>= 16;
- mask_8x8 >>= 16;
- mask_4x4 >>= 16;
- mask_4x4_int >>= 16;
+ dst->buf += 2 * MI_SIZE * dst->stride;
+ mask_16x16 >>= 2 * MI_SIZE;
+ mask_8x8 >>= 2 * MI_SIZE;
+ mask_4x4 >>= 2 * MI_SIZE;
+ mask_4x4_int >>= 2 * MI_SIZE;
}
- // restore the buf pointer in case there is additional filter pass.
+
+ // Horizontal pass
dst->buf = dst0;
}
@@ -1478,7 +1652,7 @@
assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
- for (r = 0; r < MAX_MIB_SIZE && mi_row + r < cm->mi_rows; r++) {
+ for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r++) {
unsigned int mask_16x16_r;
unsigned int mask_8x8_r;
unsigned int mask_4x4_r;
@@ -1497,26 +1671,25 @@
if (cm->use_highbitdepth) {
highbd_filter_selectively_horiz(
CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r,
- mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r << 3],
+ mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r][0],
(int)cm->bit_depth);
} else {
filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
- &lfm->lfl_y[r << 3]);
+ &lfm->lfl_y[r][0]);
}
#else
filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
- &lfm->lfl_y[r << 3]);
+ &lfm->lfl_y[r][0]);
#endif // CONFIG_AOM_HIGHBITDEPTH
- dst->buf += 8 * dst->stride;
- mask_16x16 >>= 8;
- mask_8x8 >>= 8;
- mask_4x4 >>= 8;
- mask_4x4_int >>= 8;
+ dst->buf += MI_SIZE * dst->stride;
+ mask_16x16 >>= MI_SIZE;
+ mask_8x8 >>= MI_SIZE;
+ mask_4x4 >>= MI_SIZE;
+ mask_4x4_int >>= MI_SIZE;
}
-
// restore the buf pointer in case there is additional filter pass.
dst->buf = dst0;
}
@@ -1534,15 +1707,14 @@
uint16_t mask_4x4_int = lfm->left_int_4x4_uv;
assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
+ assert(plane->plane_type == PLANE_TYPE_UV);
memset(lfm->lfl_uv, 0, sizeof(lfm->lfl_uv));
// Vertical pass: do 2 rows at one time
- for (r = 0; r < MAX_MIB_SIZE && mi_row + r < cm->mi_rows; r += 4) {
- if (plane->plane_type == 1) {
- for (c = 0; c < (MAX_MIB_SIZE >> 1); c++) {
- lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
- lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
- }
+ for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 4) {
+ for (c = 0; c < (cm->mib_size >> 1); c++) {
+ lfm->lfl_uv[r >> 1][c] = lfm->lfl_y[r][c << 1];
+ lfm->lfl_uv[(r + 2) >> 1][c] = lfm->lfl_y[r + 2][c << 1];
}
{
@@ -1557,28 +1729,28 @@
highbd_filter_selectively_vert_row2(
plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
- &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
+ &lfm->lfl_uv[r >> 1][0], (int)cm->bit_depth);
} else {
filter_selectively_vert_row2(plane->subsampling_x, dst->buf,
dst->stride, mask_16x16_l, mask_8x8_l,
mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
- &lfm->lfl_uv[r << 1]);
+ &lfm->lfl_uv[r >> 1][0]);
}
#else
filter_selectively_vert_row2(
plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
- mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_uv[r << 1]);
+ mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_uv[r >> 1][0]);
#endif // CONFIG_AOM_HIGHBITDEPTH
- dst->buf += 16 * dst->stride;
- mask_16x16 >>= 8;
- mask_8x8 >>= 8;
- mask_4x4 >>= 8;
- mask_4x4_int >>= 8;
+ dst->buf += 2 * MI_SIZE * dst->stride;
+ mask_16x16 >>= MI_SIZE;
+ mask_8x8 >>= MI_SIZE;
+ mask_4x4 >>= MI_SIZE;
+ mask_4x4_int >>= MI_SIZE;
}
}
- // restore the buf pointer in case there is additional filter pass.
+ // Horizontal pass
dst->buf = dst0;
}
@@ -1594,20 +1766,18 @@
uint64_t mask_4x4_int = lfm->above_int_4x4_uv;
assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
+ memset(lfm->lfl_uv, 0, sizeof(lfm->lfl_uv));
  // re-populate the filter level for uv, same as the code for vertical
// filter in av1_filter_block_plane_ss11_ver
- memset(lfm->lfl_uv, 0, sizeof(lfm->lfl_uv));
- for (r = 0; r < MAX_MIB_SIZE && mi_row + r < cm->mi_rows; r += 4) {
- if (plane->plane_type == 1) {
- for (c = 0; c < (MAX_MIB_SIZE >> 1); c++) {
- lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
- lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
- }
+ for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 4) {
+ for (c = 0; c < (cm->mib_size >> 1); c++) {
+ lfm->lfl_uv[r >> 1][c] = lfm->lfl_y[r][c << 1];
+ lfm->lfl_uv[(r + 2) >> 1][c] = lfm->lfl_y[r + 2][c << 1];
}
}
- for (r = 0; r < MAX_MIB_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+ for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) {
const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
const unsigned int mask_4x4_int_r =
skip_border_4x4_r ? 0 : (mask_4x4_int & 0xf);
@@ -1627,28 +1797,27 @@
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
- highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
- dst->stride, mask_16x16_r, mask_8x8_r,
- mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
- &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
+ highbd_filter_selectively_horiz(
+ CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r,
+ mask_4x4_r, mask_4x4_int_r, &cm->lf_info, &lfm->lfl_uv[r >> 1][0],
+ (int)cm->bit_depth);
} else {
filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
- &lfm->lfl_uv[r << 1]);
+ &lfm->lfl_uv[r >> 1][0]);
}
#else
filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
- &lfm->lfl_uv[r << 1]);
+ &lfm->lfl_uv[r >> 1][0]);
#endif // CONFIG_AOM_HIGHBITDEPTH
- dst->buf += 8 * dst->stride;
- mask_16x16 >>= 4;
- mask_8x8 >>= 4;
- mask_4x4 >>= 4;
- mask_4x4_int >>= 4;
+ dst->buf += MI_SIZE * dst->stride;
+ mask_16x16 >>= MI_SIZE / 2;
+ mask_8x8 >>= MI_SIZE / 2;
+ mask_4x4 >>= MI_SIZE / 2;
+ mask_4x4_int >>= MI_SIZE / 2;
}
-
// restore the buf pointer in case there is additional filter pass.
dst->buf = dst0;
}
@@ -1656,10 +1825,36 @@
void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
struct macroblockd_plane planes[MAX_MB_PLANE],
int start, int stop, int y_only) {
+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
const int num_planes = y_only ? 1 : MAX_MB_PLANE;
+ int mi_row, mi_col;
+
+#if CONFIG_VAR_TX
+ memset(cm->above_txfm_context, TX_SIZES, cm->mi_cols);
+#endif // CONFIG_VAR_TX
+ for (mi_row = start; mi_row < stop; mi_row += cm->mib_size) {
+ MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+#if CONFIG_VAR_TX
+ memset(cm->left_txfm_context, TX_SIZES, MAX_MIB_SIZE);
+#endif // CONFIG_VAR_TX
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += cm->mib_size) {
+ int plane;
+
+ av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+ for (plane = 0; plane < num_planes; ++plane) {
+ av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+ av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+ }
+ }
+ }
+#else // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
+ const int num_planes = y_only ? 1 : MAX_MB_PLANE;
+ int mi_row, mi_col;
enum lf_path path;
LOOP_FILTER_MASK lfm;
- int mi_row, mi_col;
if (y_only)
path = LF_PATH_444;
@@ -1669,16 +1864,15 @@
path = LF_PATH_444;
else
path = LF_PATH_SLOW;
-
#if CONFIG_PARALLEL_DEBLOCKING
- // Filter all the vertical edges in the whole frame
for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
-
for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
int plane;
av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+ // TODO(JBB): Make setup_mask work for non 420.
av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm);
@@ -1698,15 +1892,14 @@
}
}
}
-
- // Filter all the horizontal edges in the whole frame
for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
-
for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
int plane;
av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+ // TODO(JBB): Make setup_mask work for non 420.
av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, &lfm);
@@ -1729,11 +1922,12 @@
#else // CONFIG_PARALLEL_DEBLOCKING
for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
-
for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
int plane;
av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+ // TODO(JBB): Make setup_mask work for non 420.
av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm);
@@ -1753,12 +1947,14 @@
mi_row, mi_col);
av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col,
mi_row, mi_col);
+
break;
}
}
}
}
#endif // CONFIG_PARALLEL_DEBLOCKING
+#endif // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
}
void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
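
The loop-filter hunks above replace the hard-coded 8-pixel row advance and the 8/16-bit mask shifts with MI_SIZE, and loop to cm->mib_size instead of MAX_MIB_SIZE. A minimal sketch of the invariant the luma horizontal pass relies on, assuming MI_SIZE == 8 so that one mask bit covers one 8x8 block and a 64-pixel-wide superblock row contributes eight bits; the helper below is an illustration, not code from the tree:

#include <stdint.h>

#define MI_SIZE 8 /* assumed: one mode-info unit is 8x8 pixels */

/* Walk a column of MI rows: each iteration consumes MI_SIZE mask bits and
 * moves the destination pointer down MI_SIZE pixel rows, mirroring
 *   dst->buf += MI_SIZE * dst->stride;  mask_16x16 >>= MI_SIZE;  */
static void walk_mi_rows(uint8_t *buf, int stride, uint64_t mask_16x16,
                         int mi_rows) {
  int r;
  for (r = 0; r < mi_rows; ++r) {
    const unsigned int row_bits = (unsigned int)(mask_16x16 & 0xff);
    (void)row_bits;          /* the real code filters the edges flagged here */
    buf += MI_SIZE * stride; /* advance one MI row of pixels */
    mask_16x16 >>= MI_SIZE;  /* advance one MI row of mask bits */
  }
}
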
diff --git a/av1/common/loopfilter.h b/av1/common/loopfilter.h
index 8ac31a5..0f70672 100644
--- a/av1/common/loopfilter.h
+++ b/av1/common/loopfilter.h
@@ -46,8 +46,8 @@
// 0 = Intra, Last, Last2+Last3(CONFIG_EXT_REFS),
// GF, BRF(CONFIG_EXT_REFS), ARF
- signed char ref_deltas[MAX_REF_FRAMES];
- signed char last_ref_deltas[MAX_REF_FRAMES];
+ signed char ref_deltas[TOTAL_REFS_PER_FRAME];
+ signed char last_ref_deltas[TOTAL_REFS_PER_FRAME];
// 0 = ZERO_MV, MV
signed char mode_deltas[MAX_MODE_LF_DELTAS];
@@ -64,7 +64,7 @@
typedef struct {
loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];
- uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
+ uint8_t lvl[MAX_SEGMENTS][TOTAL_REFS_PER_FRAME][MAX_MODE_LF_DELTAS];
} loop_filter_info_n;
// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
@@ -84,8 +84,8 @@
uint16_t above_uv[TX_SIZES];
uint16_t left_int_4x4_uv;
uint16_t above_int_4x4_uv;
- uint8_t lfl_y[64];
- uint8_t lfl_uv[16];
+ uint8_t lfl_y[MAX_MIB_SIZE][MAX_MIB_SIZE];
+ uint8_t lfl_uv[MAX_MIB_SIZE / 2][MAX_MIB_SIZE / 2];
} LOOP_FILTER_MASK;
/* assorted loopfilter functions which get used elsewhere */
@@ -105,7 +105,6 @@
void av1_filter_block_plane_ss00_hor(struct AV1Common *const cm,
struct macroblockd_plane *const plane,
int mi_row, LOOP_FILTER_MASK *lfm);
-
void av1_filter_block_plane_ss11_ver(struct AV1Common *const cm,
struct macroblockd_plane *const plane,
int mi_row, LOOP_FILTER_MASK *lfm);
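
The lfl_y and lfl_uv filter-level buffers above change from flat arrays to two-dimensional ones, which is why the loop-filter callers switch from &lfl[r << 3] to &lfl[r][0]. A standalone sketch of why both forms address the same bytes, assuming MAX_MIB_SIZE == 8 as in a build without CONFIG_EXT_PARTITION (illustration only):

#include <assert.h>
#include <stdint.h>

#define MAX_MIB_SIZE 8 /* assumed: 64x64 superblocks, 8 MI units per side */

int main(void) {
  uint8_t flat[MAX_MIB_SIZE * MAX_MIB_SIZE]; /* old layout: lfl_y[64]       */
  uint8_t grid[MAX_MIB_SIZE][MAX_MIB_SIZE];  /* new layout: lfl_y[MIB][MIB] */
  const int r = 3;
  /* The old code passed &flat[r << 3]; the new code passes &grid[r][0].
   * Both point at the start of MI row r of filter levels. */
  assert(&flat[r << 3] - flat == &grid[r][0] - &grid[0][0]);
  return 0;
}
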
diff --git a/av1/common/mv.h b/av1/common/mv.h
index d0cdf69..e5400d9 100644
--- a/av1/common/mv.h
+++ b/av1/common/mv.h
@@ -12,9 +12,8 @@
#ifndef AV1_COMMON_MV_H_
#define AV1_COMMON_MV_H_
-#include "aom/aom_integer.h"
-
#include "av1/common/common.h"
+#include "aom_dsp/aom_filter.h"
#ifdef __cplusplus
extern "C" {
@@ -35,6 +34,132 @@
int32_t col;
} MV32;
+#if CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+// Bits of precision used for the model
+#define WARPEDMODEL_PREC_BITS 12
+#define WARPEDMODEL_ROW3HOMO_PREC_BITS 12
+
+// Bits of subpel precision for warped interpolation
+#define WARPEDPIXEL_PREC_BITS 6
+#define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS)
+
+// Taps for ntap filter
+#define WARPEDPIXEL_FILTER_TAPS 6
+
+// Precision of filter taps
+#define WARPEDPIXEL_FILTER_BITS 7
+
+#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
+
+typedef enum {
+ UNKNOWN_TRANSFORM = -1,
+ HOMOGRAPHY, // homography, 8-parameter
+ AFFINE, // affine, 6-parameter
+ ROTZOOM, // simplified affine with rotation and zoom only, 4-parameter
+ TRANSLATION, // translational motion 2-parameter
+ TRANS_TYPES
+} TransformationType;
+
+// number of parameters used by each transformation in TransformationTypes
+static const int n_trans_model_params[TRANS_TYPES] = { 9, 6, 4, 2 };
+
+typedef struct {
+ TransformationType wmtype;
+ int32_t wmmat[8]; // For homography wmmat[9] is assumed to be 1
+} WarpedMotionParams;
+#endif // CONFIG_GLOBAL_MOTION || CONFIG_WARPED_MOTION
+
+#if CONFIG_GLOBAL_MOTION
+// ALPHA here refers to parameters a and b in rotzoom model:
+// | a b|
+// |-b a|
+//
+// and a, b, c, d in affine model:
+// | a b|
+// | c d|
+//
+// Anything ending in PREC_BITS is the number of bits of precision
+// to maintain when converting from double to integer.
+//
+// The ABS parameters are used to create an upper and lower bound
+// for each parameter. In other words, after a parameter is integerized
+// it is clamped between -(1 << ABS_XXX_BITS) and (1 << ABS_XXX_BITS).
+//
+// XXX_PREC_DIFF and XXX_DECODE_FACTOR
+// are computed once here to prevent repetitive
+// computation on the decoder side. These are
+// to allow the global motion parameters to be encoded in a lower
+// precision than the warped model precision. This means that they
+// need to be changed to warped precision when they are decoded.
+//
+// XX_MIN, XX_MAX are also computed to avoid repeated computation
+
+#define GM_TRANS_PREC_BITS 3
+#define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS)
+#define GM_TRANS_DECODE_FACTOR (1 << GM_TRANS_PREC_DIFF)
+
+#define GM_ALPHA_PREC_BITS 12
+#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
+#define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF)
+
+#define GM_ABS_ALPHA_BITS 9
+#define GM_ABS_TRANS_BITS 9
+
+#define GM_TRANS_MAX (1 << GM_ABS_TRANS_BITS)
+#define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS)
+#define GM_TRANS_MIN -GM_TRANS_MAX
+#define GM_ALPHA_MIN -GM_ALPHA_MAX
+
+typedef enum {
+ GLOBAL_ZERO = 0,
+ GLOBAL_TRANSLATION = 1,
+ GLOBAL_ROTZOOM = 2,
+ GLOBAL_AFFINE = 3,
+ GLOBAL_MOTION_TYPES
+} GLOBAL_MOTION_TYPE;
+
+typedef struct {
+ GLOBAL_MOTION_TYPE gmtype;
+ WarpedMotionParams motion_params;
+} Global_Motion_Params;
+
+// Convert a global motion translation vector (which may have more bits than a
+// regular motion vector) into a motion vector
+static INLINE int_mv gm_get_motion_vector(const Global_Motion_Params *gm) {
+ int_mv res;
+ res.as_mv.row = (int16_t)ROUND_POWER_OF_TWO_SIGNED(gm->motion_params.wmmat[0],
+ WARPEDMODEL_PREC_BITS - 3);
+ res.as_mv.col = (int16_t)ROUND_POWER_OF_TWO_SIGNED(gm->motion_params.wmmat[1],
+ WARPEDMODEL_PREC_BITS - 3);
+ return res;
+}
+
+static INLINE TransformationType gm_to_trans_type(GLOBAL_MOTION_TYPE gmtype) {
+ switch (gmtype) {
+ case GLOBAL_ZERO: return UNKNOWN_TRANSFORM; break;
+ case GLOBAL_TRANSLATION: return TRANSLATION; break;
+ case GLOBAL_ROTZOOM: return ROTZOOM; break;
+ case GLOBAL_AFFINE: return AFFINE; break;
+ default: assert(0);
+ }
+ return UNKNOWN_TRANSFORM;
+}
+
+static INLINE GLOBAL_MOTION_TYPE get_gmtype(const Global_Motion_Params *gm) {
+ if (!gm->motion_params.wmmat[5] && !gm->motion_params.wmmat[4]) {
+ if (!gm->motion_params.wmmat[3] && !gm->motion_params.wmmat[2]) {
+ return ((!gm->motion_params.wmmat[1] && !gm->motion_params.wmmat[0])
+ ? GLOBAL_ZERO
+ : GLOBAL_TRANSLATION);
+ } else {
+ return GLOBAL_ROTZOOM;
+ }
+ } else {
+ return GLOBAL_AFFINE;
+ }
+}
+#endif // CONFIG_GLOBAL_MOTION
+
#if CONFIG_REF_MV
typedef struct candidate_mv {
int_mv this_mv;
@@ -58,6 +183,9 @@
mv->row = clamp(mv->row, min_row, max_row);
}
+static INLINE int mv_has_subpel(const MV *mv) {
+ return (mv->row & SUBPEL_MASK) || (mv->col & SUBPEL_MASK);
+}
#ifdef __cplusplus
} // extern "C"
#endif
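
The precision comments in the global-motion block above amount to a scale-and-round round trip. A worked sketch using the values defined in the hunk (GM_TRANS_PREC_BITS == 3, WARPEDMODEL_PREC_BITS == 12, hence GM_TRANS_DECODE_FACTOR == 1 << 9); this is an illustration, not code from the tree:

#include <stdio.h>

int main(void) {
  const int coded_trans = 5;              /* 5/8 pel, 3 fractional bits        */
  const int wm_trans = coded_trans * 512; /* * GM_TRANS_DECODE_FACTOR (1 << 9) */
  /* gm_get_motion_vector() rounds from 12 fractional bits back down to the
   * 3 fractional bits of a regular 1/8-pel MV: ROUND_POWER_OF_TWO(x, 12 - 3). */
  const int mv_row = (wm_trans + (1 << 8)) >> 9;
  printf("%d\n", mv_row);                 /* prints 5: the signalled value     */
  return 0;
}
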
diff --git a/av1/common/mvref_common.c b/av1/common/mvref_common.c
index 09926c1..f9402e9 100644
--- a/av1/common/mvref_common.c
+++ b/av1/common/mvref_common.c
@@ -8,14 +8,15 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+
#include "av1/common/mvref_common.h"
#if CONFIG_REF_MV
+
static uint8_t add_ref_mv_candidate(
- const MACROBLOCKD *xd, const MODE_INFO *const candidate_mi,
- const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2],
- uint8_t *refmv_count, CANDIDATE_MV *ref_mv_stack, const int use_hp, int len,
- int block, int col) {
+ const MODE_INFO *const candidate_mi, const MB_MODE_INFO *const candidate,
+ const MV_REFERENCE_FRAME rf[2], uint8_t *refmv_count,
+ CANDIDATE_MV *ref_mv_stack, const int use_hp, int len, int block, int col) {
int index = 0, ref;
int newmv_count = 0;
@@ -25,7 +26,6 @@
if (candidate->ref_frame[ref] == rf[0]) {
int_mv this_refmv = get_sub_block_mv(candidate_mi, ref, col, block);
lower_mv_precision(&this_refmv.as_mv, use_hp);
- clamp_mv_ref(&this_refmv.as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
for (index = 0; index < *refmv_count; ++index)
if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) break;
@@ -40,14 +40,18 @@
ref_mv_stack[index].weight = 2 * len;
++(*refmv_count);
- if (candidate->mode == NEWMV) ++newmv_count;
+#if CONFIG_EXT_INTER
+ if (candidate->mode == NEWMV || candidate->mode == NEWFROMNEARMV)
+#else
+ if (candidate->mode == NEWMV)
+#endif // CONFIG_EXT_INTER
+ ++newmv_count;
}
if (candidate_mi->mbmi.sb_type < BLOCK_8X8 && block >= 0) {
int alt_block = 3 - block;
this_refmv = get_sub_block_mv(candidate_mi, ref, col, alt_block);
lower_mv_precision(&this_refmv.as_mv, use_hp);
- clamp_mv_ref(&this_refmv.as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
for (index = 0; index < *refmv_count; ++index)
if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) break;
@@ -62,7 +66,12 @@
ref_mv_stack[index].weight = len;
++(*refmv_count);
- if (candidate->mode == NEWMV) ++newmv_count;
+#if CONFIG_EXT_INTER
+ if (candidate->mode == NEWMV || candidate->mode == NEWFROMNEARMV)
+#else
+ if (candidate->mode == NEWMV)
+#endif // CONFIG_EXT_INTER
+ ++newmv_count;
}
}
}
@@ -70,12 +79,11 @@
} else {
// compound reference frame
if (candidate->ref_frame[0] == rf[0] && candidate->ref_frame[1] == rf[1]) {
- int_mv this_refmv[2] = { get_sub_block_mv(candidate_mi, 0, col, block),
- get_sub_block_mv(candidate_mi, 1, col, block) };
+ int_mv this_refmv[2];
for (ref = 0; ref < 2; ++ref) {
+ this_refmv[ref] = get_sub_block_mv(candidate_mi, ref, col, block);
lower_mv_precision(&this_refmv[ref].as_mv, use_hp);
- clamp_mv_ref(&this_refmv[ref].as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
}
for (index = 0; index < *refmv_count; ++index)
@@ -96,7 +104,12 @@
ref_mv_stack[index].weight = 2 * len;
++(*refmv_count);
- if (candidate->mode == NEWMV) ++newmv_count;
+#if CONFIG_EXT_INTER
+ if (candidate->mode == NEW_NEWMV)
+#else
+ if (candidate->mode == NEWMV)
+#endif // CONFIG_EXT_INTER
+ ++newmv_count;
}
if (candidate_mi->mbmi.sb_type < BLOCK_8X8 && block >= 0) {
@@ -104,11 +117,8 @@
this_refmv[0] = get_sub_block_mv(candidate_mi, 0, col, alt_block);
this_refmv[1] = get_sub_block_mv(candidate_mi, 1, col, alt_block);
- for (ref = 0; ref < 2; ++ref) {
+ for (ref = 0; ref < 2; ++ref)
lower_mv_precision(&this_refmv[ref].as_mv, use_hp);
- clamp_mv_ref(&this_refmv[ref].as_mv, xd->n8_w << 3, xd->n8_h << 3,
- xd);
- }
for (index = 0; index < *refmv_count; ++index)
if (ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int &&
@@ -128,7 +138,12 @@
ref_mv_stack[index].weight = len;
++(*refmv_count);
- if (candidate->mode == NEWMV) ++newmv_count;
+#if CONFIG_EXT_INTER
+ if (candidate->mode == NEW_NEWMV)
+#else
+ if (candidate->mode == NEWMV)
+#endif // CONFIG_EXT_INTER
+ ++newmv_count;
}
}
}
@@ -146,27 +161,29 @@
for (i = 0; i < xd->n8_w && *refmv_count < MAX_REF_MV_STACK_SIZE;) {
POSITION mi_pos;
+ const int use_step_16 = (xd->n8_w >= 8);
+
mi_pos.row = row_offset;
mi_pos.col = i;
-
- if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) {
+ if (is_inside(tile, mi_col, mi_row, &mi_pos)) {
const MODE_INFO *const candidate_mi =
xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
- const MB_MODE_INFO *const candidate_mbmi = &candidate_mi->mbmi;
+ const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
int len =
- AOMMIN(xd->n8_w, num_8x8_blocks_wide_lookup[candidate_mbmi->sb_type]);
- if (xd->n8_w >= 8) len = AOMMAX(2, len);
+ AOMMIN(xd->n8_w, num_8x8_blocks_wide_lookup[candidate->sb_type]);
+ if (use_step_16) len = AOMMAX(2, len);
newmv_count += add_ref_mv_candidate(
- xd, candidate_mi, candidate_mbmi, rf, refmv_count, ref_mv_stack,
+ candidate_mi, candidate, rf, refmv_count, ref_mv_stack,
cm->allow_high_precision_mv, len, block, mi_pos.col);
i += len;
} else {
- if (xd->n8_w >= 8)
+ if (use_step_16)
i += 2;
else
++i;
}
}
+
return newmv_count;
}
@@ -180,27 +197,29 @@
for (i = 0; i < xd->n8_h && *refmv_count < MAX_REF_MV_STACK_SIZE;) {
POSITION mi_pos;
+ const int use_step_16 = (xd->n8_h >= 8);
+
mi_pos.row = i;
mi_pos.col = col_offset;
-
- if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) {
+ if (is_inside(tile, mi_col, mi_row, &mi_pos)) {
const MODE_INFO *const candidate_mi =
xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
- const MB_MODE_INFO *const candidate_mbmi = &candidate_mi->mbmi;
+ const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
int len =
- AOMMIN(xd->n8_h, num_8x8_blocks_high_lookup[candidate_mbmi->sb_type]);
- if (xd->n8_h >= 8) len = AOMMAX(2, len);
+ AOMMIN(xd->n8_h, num_8x8_blocks_high_lookup[candidate->sb_type]);
+ if (use_step_16) len = AOMMAX(2, len);
newmv_count += add_ref_mv_candidate(
- xd, candidate_mi, candidate_mbmi, rf, refmv_count, ref_mv_stack,
+ candidate_mi, candidate, rf, refmv_count, ref_mv_stack,
cm->allow_high_precision_mv, len, block, mi_pos.col);
i += len;
} else {
- if (xd->n8_h >= 8)
+ if (use_step_16)
i += 2;
else
++i;
}
}
+
return newmv_count;
}
@@ -216,16 +235,18 @@
mi_pos.row = row_offset;
mi_pos.col = col_offset;
- if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &mi_pos) &&
+ if (is_inside(tile, mi_col, mi_row, &mi_pos) &&
*refmv_count < MAX_REF_MV_STACK_SIZE) {
const MODE_INFO *const candidate_mi =
xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
- const MB_MODE_INFO *const candidate_mbmi = &candidate_mi->mbmi;
+ const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
const int len = 1;
+
newmv_count += add_ref_mv_candidate(
- xd, candidate_mi, candidate_mbmi, rf, refmv_count, ref_mv_stack,
+ candidate_mi, candidate, rf, refmv_count, ref_mv_stack,
cm->allow_high_precision_mv, len, block, mi_pos.col);
} // Analyze a single 8x8 block motion information.
+
return newmv_count;
}
@@ -262,6 +283,13 @@
if (xd->n8_w > xd->n8_h)
if (xd->is_sec_rect) has_tr = 0;
+#if CONFIG_EXT_PARTITION_TYPES
+ // The bottom left square of a Vertical A does not have a top right as it is
+ // decoded before the right hand rectangle of the partition
+ if (xd->mi[0]->mbmi.partition == PARTITION_VERT_A)
+ if ((mi_row & bs) && !(mi_col & bs)) has_tr = 0;
+#endif // CONFIG_EXT_PARTITION_TYPES
+
return has_tr;
}
@@ -275,6 +303,7 @@
for (rf = 0; rf < 2; ++rf) {
if (candidate->ref_frame[rf] == ref_frame) {
const int list_range = AOMMIN(refmv_count, MAX_MV_REF_CANDIDATES);
+
const int_mv pred_mv = candidate->mv[rf];
for (idx = 0; idx < list_range; ++idx)
if (pred_mv.as_int == ref_mv_stack[idx].this_mv.as_int) break;
@@ -304,14 +333,12 @@
mi_pos.row = blk_row;
mi_pos.col = blk_col;
- if (!is_inside(&xd->tile, mi_col, mi_row, cm->mi_rows, &mi_pos))
- return coll_blk_count;
+ if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return coll_blk_count;
for (ref = 0; ref < 2; ++ref) {
if (prev_frame_mvs->ref_frame[ref] == ref_frame) {
int_mv this_refmv = prev_frame_mvs->mv[ref];
lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv);
- clamp_mv_ref(&this_refmv.as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
if (abs(this_refmv.as_mv.row) >= 16 || abs(this_refmv.as_mv.col) >= 16)
mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
@@ -478,18 +505,19 @@
len = nr_len;
}
+ // TODO(jingning): Clean-up needed.
if (xd->is_sec_rect) {
if (xd->n8_w < xd->n8_h) {
const MODE_INFO *const candidate_mi = xd->mi[-1];
- const MB_MODE_INFO *const candidate_mbmi = &candidate_mi->mbmi;
- handle_sec_rect_block(candidate_mbmi, nearest_refmv_count, ref_mv_stack,
+ const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+ handle_sec_rect_block(candidate, nearest_refmv_count, ref_mv_stack,
ref_frame, mode_context);
}
if (xd->n8_w > xd->n8_h) {
const MODE_INFO *const candidate_mi = xd->mi[-xd->mi_stride];
- const MB_MODE_INFO *const candidate_mbmi = &candidate_mi->mbmi;
- handle_sec_rect_block(candidate_mbmi, nearest_refmv_count, ref_mv_stack,
+ const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+ handle_sec_rect_block(candidate, nearest_refmv_count, ref_mv_stack,
ref_frame, mode_context);
}
}
@@ -560,7 +588,7 @@
// and we also need to keep a mode count.
for (i = 0; i < 2; ++i) {
const POSITION *const mv_ref = &mv_ref_search[i];
- if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+ if (is_inside(tile, mi_col, mi_row, mv_ref)) {
const MODE_INFO *const candidate_mi =
xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
@@ -582,7 +610,7 @@
// mode counts.
for (; i < MVREF_NEIGHBOURS; ++i) {
const POSITION *const mv_ref = &mv_ref_search[i];
- if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+ if (is_inside(tile, mi_col, mi_row, mv_ref)) {
const MB_MODE_INFO *const candidate =
&xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi;
#if CONFIG_SIMP_MV_PRED
@@ -633,7 +661,7 @@
if (different_ref_found) {
for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
const POSITION *mv_ref = &mv_ref_search[i];
- if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+ if (is_inside(tile, mi_col, mi_row, mv_ref)) {
const MB_MODE_INFO *const candidate =
&xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]->mbmi;
#if CONFIG_SIMP_MV_PRED
@@ -677,28 +705,85 @@
Done:
if (mode_context)
mode_context[ref_frame] = counter_to_context[context_counter];
-
for (i = refmv_count; i < MAX_MV_REF_CANDIDATES; ++i)
mv_ref_list[i].as_int = 0;
}
+#if CONFIG_EXT_INTER
+// This function keeps a mode count for a given MB/SB
+void av1_update_mv_context(const MACROBLOCKD *xd, MODE_INFO *mi,
+ MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list,
+ int block, int mi_row, int mi_col,
+ int16_t *mode_context) {
+ int i, refmv_count = 0;
+ const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
+ int context_counter = 0;
+ const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type] << 3;
+ const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type] << 3;
+ const TileInfo *const tile = &xd->tile;
+
+ // Blank the reference vector list
+ memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+
+  // Only the nearest 2 blocks are examined.
+ // If the size < 8x8, we get the mv from the bmi substructure;
+ for (i = 0; i < 2; ++i) {
+ const POSITION *const mv_ref = &mv_ref_search[i];
+ if (is_inside(tile, mi_col, mi_row, mv_ref)) {
+ const MODE_INFO *const candidate_mi =
+ xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
+ const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+
+ // Keep counts for entropy encoding.
+ context_counter += mode_2_counter[candidate->mode];
+
+ if (candidate->ref_frame[0] == ref_frame) {
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block),
+ refmv_count, mv_ref_list, bw, bh, xd, Done);
+ } else if (candidate->ref_frame[1] == ref_frame) {
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block),
+ refmv_count, mv_ref_list, bw, bh, xd, Done);
+ }
+ }
+ }
+
+Done:
+
+ if (mode_context)
+ mode_context[ref_frame] = counter_to_context[context_counter];
+}
+#endif // CONFIG_EXT_INTER
+
void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
#if CONFIG_REF_MV
uint8_t *ref_mv_count, CANDIDATE_MV *ref_mv_stack,
+#if CONFIG_EXT_INTER
+ int16_t *compound_mode_context,
+#endif // CONFIG_EXT_INTER
#endif
int_mv *mv_ref_list, int mi_row, int mi_col,
find_mv_refs_sync sync, void *const data,
int16_t *mode_context) {
#if CONFIG_REF_MV
int idx, all_zero = 1;
+#endif
+#if CONFIG_EXT_INTER
+ av1_update_mv_context(xd, mi, ref_frame, mv_ref_list, -1, mi_row, mi_col,
+#if CONFIG_REF_MV
+ compound_mode_context);
+#else
+ mode_context);
+#endif // CONFIG_REF_MV
+#endif // CONFIG_EXT_INTER
+#if CONFIG_REF_MV
if (ref_frame <= ALTREF_FRAME)
find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, -1, mi_row, mi_col,
sync, data, mode_context);
#else
find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, -1, mi_row, mi_col, sync,
data, mode_context);
-#endif
+#endif // CONFIG_REF_MV
#if CONFIG_REF_MV
setup_ref_mv_list(cm, xd, ref_frame, ref_mv_count, ref_mv_stack, mv_ref_list,
@@ -732,18 +817,26 @@
void av1_append_sub8x8_mvs_for_idx(const AV1_COMMON *cm, MACROBLOCKD *xd,
int block, int ref, int mi_row, int mi_col,
+#if CONFIG_REF_MV
+ CANDIDATE_MV *ref_mv_stack,
+ uint8_t *ref_mv_count,
+#endif
+#if CONFIG_EXT_INTER
+ int_mv *mv_list,
+#endif // CONFIG_EXT_INTER
int_mv *nearest_mv, int_mv *near_mv) {
+#if !CONFIG_EXT_INTER
int_mv mv_list[MAX_MV_REF_CANDIDATES];
+#endif // !CONFIG_EXT_INTER
MODE_INFO *const mi = xd->mi[0];
b_mode_info *bmi = mi->bmi;
int n;
-
#if CONFIG_REF_MV
- CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE];
CANDIDATE_MV tmp_mv;
- uint8_t ref_mv_count = 0, idx;
+ uint8_t idx;
uint8_t above_count = 0, left_count = 0;
MV_REFERENCE_FRAME rf[2] = { mi->mbmi.ref_frame[ref], NONE };
+ *ref_mv_count = 0;
#endif
assert(MAX_MV_REF_CANDIDATES == 2);
@@ -753,12 +846,12 @@
#if CONFIG_REF_MV
scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, -1, 0, ref_mv_stack,
- &ref_mv_count);
- above_count = ref_mv_count;
+ ref_mv_count);
+ above_count = *ref_mv_count;
scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf, 0, -1, ref_mv_stack,
- &ref_mv_count);
- left_count = ref_mv_count - above_count;
+ ref_mv_count);
+ left_count = *ref_mv_count - above_count;
if (above_count > 1 && left_count > 0) {
tmp_mv = ref_mv_stack[1];
@@ -766,10 +859,12 @@
ref_mv_stack[above_count] = tmp_mv;
}
- for (idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, ref_mv_count); ++idx) {
+ for (idx = 0; idx < *ref_mv_count; ++idx)
+ clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv, xd->n8_w << 3, xd->n8_h << 3,
+ xd);
+
+ for (idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *ref_mv_count); ++idx)
mv_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int;
- clamp_mv_ref(&mv_list[idx].as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
- }
#endif
near_mv->as_int = 0;
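
The add_ref_mv_candidate() changes above keep the stack discipline unchanged: look for an identical candidate first and merge weights, otherwise append while the stack has room. A reduced sketch of that pattern, with illustrative names and sizes rather than the library's types:

#include <stdint.h>

#define STACK_SIZE 16 /* stands in for MAX_REF_MV_STACK_SIZE */

typedef struct {
  int32_t mv;  /* packed motion vector, like int_mv.as_int */
  int weight;
} cand_t;

static void push_candidate(cand_t *stack, uint8_t *count, int32_t mv,
                           int weight) {
  int i;
  for (i = 0; i < *count; ++i)
    if (stack[i].mv == mv) break; /* duplicate found: merge the weights */
  if (i < *count) {
    stack[i].weight += weight;
  } else if (*count < STACK_SIZE) { /* room left: append a new candidate */
    stack[i].mv = mv;
    stack[i].weight = weight;
    ++(*count);
  }
}
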
diff --git a/av1/common/mvref_common.h b/av1/common/mvref_common.h
index b8eecf7..a9478a6 100644
--- a/av1/common/mvref_common.h
+++ b/av1/common/mvref_common.h
@@ -60,6 +60,19 @@
0, // NEARMV
3, // ZEROMV
1, // NEWMV
+#if CONFIG_EXT_INTER
+ 1, // NEWFROMNEARMV
+ 0, // NEAREST_NEARESTMV
+ 0, // NEAREST_NEARMV
+ 0, // NEAR_NEARESTMV
+ 0, // NEAR_NEARMV
+ 1, // NEAREST_NEWMV
+ 1, // NEW_NEARESTMV
+ 1, // NEAR_NEWMV
+ 1, // NEW_NEARMV
+ 3, // ZERO_ZEROMV
+ 1, // NEW_NEWMV
+#endif // CONFIG_EXT_INTER
};
// There are 3^3 different combinations of 3 counts that can be either 0,1 or
@@ -205,7 +218,37 @@
{ -1, -1 },
{ -1, 0 },
{ 0, -1 },
- { -1, 6 } }
+ { -1, 6 } },
+#if CONFIG_EXT_PARTITION
+ // TODO(debargha/jingning) Making them twice the 32x64, .. ones above
+ // 64x128
+ { { 0, -2 },
+ { -2, 0 },
+ { 8, -2 },
+ { -2, 4 },
+ { -2, -2 },
+ { 0, -6 },
+ { -6, 0 },
+ { 4, -2 } },
+ // 128x64
+ { { -2, 0 },
+ { 0, -2 },
+ { -2, 8 },
+ { 4, -2 },
+ { -2, -2 },
+ { -6, 0 },
+ { 0, -6 },
+ { -2, 4 } },
+ // 128x128
+ { { -2, 6 },
+ { 6, -2 },
+ { -2, 8 },
+ { 8, -2 },
+ { -2, -2 },
+ { -2, 0 },
+ { 0, -2 },
+ { -2, 12 } },
+#endif // CONFIG_EXT_PARTITION
};
#endif
@@ -214,7 +257,11 @@
};
// clamp_mv_ref
+#if CONFIG_EXT_PARTITION
+#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
+#else
#define MV_BORDER (8 << 3) // Allow 8 pels in 1/8th pel units
+#endif // CONFIG_EXT_PARTITION
static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) {
clamp_mv(mv, xd->mb_to_left_edge - bw * 8 - MV_BORDER,
@@ -300,10 +347,10 @@
// Checks that the given mi_row, mi_col and search point
// are inside the borders of the tile.
static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row,
- int mi_rows, const POSITION *mi_pos) {
- return !(mi_row + mi_pos->row < 0 ||
+ const POSITION *mi_pos) {
+ return !(mi_row + mi_pos->row < tile->mi_row_start ||
mi_col + mi_pos->col < tile->mi_col_start ||
- mi_row + mi_pos->row >= mi_rows ||
+ mi_row + mi_pos->row >= tile->mi_row_end ||
mi_col + mi_pos->col >= tile->mi_col_end);
}
@@ -320,6 +367,9 @@
int ref_mv_idx) {
int_mv this_mv = (ref == 0) ? ref_mv_stack[ref_mv_idx].this_mv
: ref_mv_stack[ref_mv_idx].comp_mv;
+#if CONFIG_EXT_INTER
+ return 0;
+#endif
if (ref_mv_stack[ref_mv_idx].weight >= REF_CAT_LEVEL && ref_mv_count > 0) {
if (abs(this_mv.as_mv.row -
@@ -336,9 +386,10 @@
static INLINE int8_t av1_ref_frame_type(const MV_REFERENCE_FRAME *const rf) {
if (rf[1] > INTRA_FRAME) {
- return MAX_REF_FRAMES + FWD_RF_OFFSET(rf[0]) +
+ return TOTAL_REFS_PER_FRAME + FWD_RF_OFFSET(rf[0]) +
BWD_RF_OFFSET(rf[1]) * FWD_REFS;
}
+
return rf[0];
}
@@ -356,13 +407,14 @@
static INLINE void av1_set_ref_frame(MV_REFERENCE_FRAME *rf,
int8_t ref_frame_type) {
- if (ref_frame_type >= MAX_REF_FRAMES) {
- rf[0] = ref_frame_map[ref_frame_type - MAX_REF_FRAMES][0];
- rf[1] = ref_frame_map[ref_frame_type - MAX_REF_FRAMES][1];
+ if (ref_frame_type >= TOTAL_REFS_PER_FRAME) {
+ rf[0] = ref_frame_map[ref_frame_type - TOTAL_REFS_PER_FRAME][0];
+ rf[1] = ref_frame_map[ref_frame_type - TOTAL_REFS_PER_FRAME][1];
} else {
rf[0] = ref_frame_type;
rf[1] = NONE;
- assert(ref_frame_type > INTRA_FRAME && ref_frame_type < MAX_REF_FRAMES);
+ assert(ref_frame_type > INTRA_FRAME &&
+ ref_frame_type < TOTAL_REFS_PER_FRAME);
}
}
@@ -381,7 +433,12 @@
return mode_ctx;
}
- return mode_context[ref_frame_type];
+ if (rf[1] > INTRA_FRAME)
+ return mode_context[rf[0]] & (mode_context[rf[1]] | 0x00ff);
+ else if (rf[0] != ALTREF_FRAME)
+ return mode_context[rf[0]] & ~(mode_context[ALTREF_FRAME] & 0xfe00);
+ else
+ return mode_context[ref_frame_type];
}
static INLINE uint8_t av1_drl_ctx(const CANDIDATE_MV *ref_mv_stack,
@@ -398,7 +455,6 @@
ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL)
return 3;
- assert(0);
return 0;
}
#endif
@@ -408,6 +464,9 @@
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
#if CONFIG_REF_MV
uint8_t *ref_mv_count, CANDIDATE_MV *ref_mv_stack,
+#if CONFIG_EXT_INTER
+ int16_t *compound_mode_context,
+#endif // CONFIG_EXT_INTER
#endif
int_mv *mv_ref_list, int mi_row, int mi_col,
find_mv_refs_sync sync, void *const data,
@@ -421,8 +480,23 @@
void av1_append_sub8x8_mvs_for_idx(const AV1_COMMON *cm, MACROBLOCKD *xd,
int block, int ref, int mi_row, int mi_col,
+#if CONFIG_REF_MV
+ CANDIDATE_MV *ref_mv_stack,
+ uint8_t *ref_mv_count,
+#endif
+#if CONFIG_EXT_INTER
+ int_mv *mv_list,
+#endif // CONFIG_EXT_INTER
int_mv *nearest_mv, int_mv *near_mv);
+#if CONFIG_EXT_INTER
+// This function keeps a mode count for a given MB/SB
+void av1_update_mv_context(const MACROBLOCKD *xd, MODE_INFO *mi,
+ MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list,
+ int block, int mi_row, int mi_col,
+ int16_t *mode_context);
+#endif // CONFIG_EXT_INTER
+
#ifdef __cplusplus
} // extern "C"
#endif
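
is_inside() above now clips candidate positions to the tile rather than to the frame, using tile->mi_row_start and tile->mi_row_end in place of 0 and cm->mi_rows. A self-contained sketch of the same predicate; the struct fields mirror the names in the hunk, the values are made up for illustration:

#include <stdio.h>

typedef struct { int row, col; } POSITION;
typedef struct { int mi_row_start, mi_row_end, mi_col_start, mi_col_end; } TileInfo;

/* A neighbour is usable only if it lies inside the current tile. */
static int inside_tile(const TileInfo *tile, int mi_col, int mi_row,
                       const POSITION *pos) {
  return !(mi_row + pos->row < tile->mi_row_start ||
           mi_col + pos->col < tile->mi_col_start ||
           mi_row + pos->row >= tile->mi_row_end ||
           mi_col + pos->col >= tile->mi_col_end);
}

int main(void) {
  const TileInfo tile = { 16, 32, 0, 64 }; /* tile spans MI rows [16, 32)     */
  const POSITION above = { -1, 0 };
  printf("%d\n", inside_tile(&tile, 8, 16, &above)); /* 0: row above the tile */
  printf("%d\n", inside_tile(&tile, 8, 20, &above)); /* 1: inside the tile    */
  return 0;
}
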
diff --git a/av1/common/odintrin.h b/av1/common/odintrin.h
index a6db72f..96131f0 100644
--- a/av1/common/odintrin.h
+++ b/av1/common/odintrin.h
@@ -257,4 +257,4 @@
} // extern "C"
#endif
-#endif
+#endif // AV1_COMMON_ODINTRIN_H_
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index b4e5166..20270cb 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -22,6 +22,11 @@
#include "av1/common/entropymv.h"
#include "av1/common/frame_buffers.h"
#include "av1/common/loopfilter.h"
+#include "av1/common/mv.h"
+#include "av1/common/quant_common.h"
+#if CONFIG_LOOP_RESTORATION
+#include "av1/common/restoration.h"
+#endif // CONFIG_LOOP_RESTORATION
#include "av1/common/tile_common.h"
#include "av1/common/odintrin.h"
#if CONFIG_PVQ
@@ -32,8 +37,6 @@
extern "C" {
#endif
-#define REFS_PER_FRAME (ALTREF_FRAME - LAST_FRAME + 1)
-
#define REF_FRAMES_LOG2 3
#define REF_FRAMES (1 << REF_FRAMES_LOG2)
@@ -70,10 +73,6 @@
typedef enum {
/**
- * Don't update frame context
- */
- REFRESH_FRAME_CONTEXT_OFF,
- /**
* Update frame context to values resulting from forward probability
* updates signaled in the frame header
*/
@@ -152,9 +151,9 @@
int subsampling_y;
#if CONFIG_AOM_HIGHBITDEPTH
- int use_highbitdepth; // Marks if we need to use 16bit frame buffers.
+ // Marks if we need to use 16bit frame buffers (1: yes, 0: no).
+ int use_highbitdepth;
#endif
-
#if CONFIG_CLPF
// Two bits are used to signal the strength for all blocks and the
// valid values are:
@@ -194,8 +193,8 @@
// TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
// roll new_fb_idx into it.
- // Each frame can reference REFS_PER_FRAME buffers
- RefBuffer frame_refs[REFS_PER_FRAME];
+ // Each Inter frame can reference INTER_REFS_PER_FRAME buffers
+ RefBuffer frame_refs[INTER_REFS_PER_FRAME];
int new_fb_idx;
@@ -256,6 +255,10 @@
int min_qmlevel;
int max_qmlevel;
#endif
+#if CONFIG_NEW_QUANT
+ dequant_val_type_nuq y_dequant_nuq[MAX_SEGMENTS][QUANT_PROFILES][COEF_BANDS];
+ dequant_val_type_nuq uv_dequant_nuq[MAX_SEGMENTS][QUANT_PROFILES][COEF_BANDS];
+#endif
/* We allocate a MODE_INFO struct for each macroblock, together with
an extra row on top and column on the left to simplify prediction. */
@@ -295,12 +298,16 @@
InterpFilter interp_filter;
loop_filter_info_n lf_info;
+#if CONFIG_LOOP_RESTORATION
+ RestorationInfo rst_info;
+ RestorationInternal rst_internal;
+#endif // CONFIG_LOOP_RESTORATION
// Flag signaling how frame contexts should be updated at the end of
// a frame decode
REFRESH_FRAME_CONTEXT_MODE refresh_frame_context;
- int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */
+ int ref_frame_sign_bias[TOTAL_REFS_PER_FRAME]; /* Two state 0, 1 */
struct loopfilter lf;
struct segmentation seg;
@@ -313,7 +320,7 @@
MV_REFERENCE_FRAME comp_bwd_ref[BWD_REFS];
#else
MV_REFERENCE_FRAME comp_fixed_ref;
- MV_REFERENCE_FRAME comp_var_ref[2];
+ MV_REFERENCE_FRAME comp_var_ref[COMP_REFS];
#endif // CONFIG_EXT_REFS
REFERENCE_MODE reference_mode;
@@ -322,6 +329,18 @@
unsigned int frame_context_idx; /* Context to use/update */
FRAME_COUNTS counts;
+#if CONFIG_ENTROPY
+ // The initial probabilities for a frame, before any subframe backward update,
+ // and after forward update.
+ av1_coeff_probs_model starting_coef_probs[TX_SIZES][PLANE_TYPES];
+ // Number of subframe backward updates already done
+ uint8_t coef_probs_update_idx;
+ // Signal if the backward update is subframe or end-of-frame
+ uint8_t partial_prob_update;
+ // Frame level flag to turn on/off subframe backward update
+ uint8_t do_subframe_update;
+#endif // CONFIG_ENTROPY
+
unsigned int current_video_frame;
BITSTREAM_PROFILE profile;
@@ -331,8 +350,12 @@
int error_resilient_mode;
+#if !CONFIG_EXT_TILE
int log2_tile_cols, log2_tile_rows;
- int tile_sz_mag;
+#endif // !CONFIG_EXT_TILE
+ int tile_cols, tile_rows;
+ int tile_width, tile_height; // In MI units
+
int byte_alignment;
int skip_loop_filter;
@@ -348,7 +371,11 @@
BufferPool *buffer_pool;
PARTITION_CONTEXT *above_seg_context;
- ENTROPY_CONTEXT *above_context;
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+#if CONFIG_VAR_TX
+ TXFM_CONTEXT *above_txfm_context;
+ TXFM_CONTEXT left_txfm_context[MAX_MIB_SIZE];
+#endif
int above_context_alloc_cols;
// scratch memory for intraonly/keyframe forward updates from default tables
@@ -358,6 +385,13 @@
#if CONFIG_DAALA_EC
aom_cdf_prob kf_y_cdf[INTRA_MODES][INTRA_MODES][INTRA_MODES];
#endif
+#if CONFIG_GLOBAL_MOTION
+ Global_Motion_Params global_motion[TOTAL_REFS_PER_FRAME];
+#endif
+
+ BLOCK_SIZE sb_size; // Size of the superblock used for this frame
+ int mib_size; // Size of the superblock in units of MI blocks
+ int mib_size_log2; // Log 2 of above.
#if CONFIG_DERING
int dering_level;
#endif
@@ -432,8 +466,12 @@
bufs[new_idx].ref_count++;
}
-static INLINE int mi_cols_aligned_to_sb(int n_mis) {
- return ALIGN_POWER_OF_TWO(n_mis, MAX_MIB_SIZE_LOG2);
+static INLINE int mi_cols_aligned_to_sb(const AV1_COMMON *cm) {
+ return ALIGN_POWER_OF_TWO(cm->mi_cols, cm->mib_size_log2);
+}
+
+static INLINE int mi_rows_aligned_to_sb(const AV1_COMMON *cm) {
+ return ALIGN_POWER_OF_TWO(cm->mi_rows, cm->mib_size_log2);
}
static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) {
@@ -446,38 +484,46 @@
#endif
tran_low_t *dqcoeff) {
int i;
-
for (i = 0; i < MAX_MB_PLANE; ++i) {
xd->plane[i].dqcoeff = dqcoeff;
#if CONFIG_PVQ
xd->plane[i].pvq_ref_coeff = pvq_ref_coeff;
#endif
- xd->above_context[i] =
- cm->above_context +
- i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
-
+ xd->above_context[i] = cm->above_context[i];
if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
memcpy(xd->plane[i].seg_dequant, cm->y_dequant, sizeof(cm->y_dequant));
#if CONFIG_AOM_QM
memcpy(xd->plane[i].seg_iqmatrix, cm->y_iqmatrix, sizeof(cm->y_iqmatrix));
#endif
+
+#if CONFIG_NEW_QUANT
+ memcpy(xd->plane[i].seg_dequant_nuq, cm->y_dequant_nuq,
+ sizeof(cm->y_dequant_nuq));
+#endif
} else {
memcpy(xd->plane[i].seg_dequant, cm->uv_dequant, sizeof(cm->uv_dequant));
#if CONFIG_AOM_QM
memcpy(xd->plane[i].seg_iqmatrix, cm->uv_iqmatrix,
sizeof(cm->uv_iqmatrix));
#endif
+#if CONFIG_NEW_QUANT
+ memcpy(xd->plane[i].seg_dequant_nuq, cm->uv_dequant_nuq,
+ sizeof(cm->uv_dequant_nuq));
+#endif
}
xd->fc = cm->fc;
}
xd->above_seg_context = cm->above_seg_context;
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context;
+#endif
xd->mi_stride = cm->mi_stride;
xd->error_info = &cm->error;
}
static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) {
const int above_idx = mi_col * 2;
- const int left_idx = (mi_row * 2) & 15;
+ const int left_idx = (mi_row * 2) & MAX_MIB_MASK_2;
int i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
struct macroblockd_plane *const pd = &xd->plane[i];
@@ -499,6 +545,8 @@
xd->plane[i].n4_h = (bh << 1) >> xd->plane[i].subsampling_y;
xd->plane[i].n4_wl = bwl - xd->plane[i].subsampling_x;
xd->plane[i].n4_hl = bhl - xd->plane[i].subsampling_y;
+ xd->plane[i].width = xd->plane[i].n4_w * 4;
+ xd->plane[i].height = xd->plane[i].n4_h * 4;
}
}
@@ -533,7 +581,6 @@
xd->n8_h = bh;
xd->n8_w = bw;
-
#if CONFIG_REF_MV
xd->is_sec_rect = 0;
if (xd->n8_w < xd->n8_h)
@@ -572,6 +619,12 @@
PARTITION_CONTEXT *const left_ctx =
xd->left_seg_context + (mi_row & MAX_MIB_MASK);
+#if CONFIG_EXT_PARTITION_TYPES
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
+ memset(above_ctx, partition_context_lookup[subsize].above, bw);
+ memset(left_ctx, partition_context_lookup[subsize].left, bh);
+#else
// num_4x4_blocks_wide_lookup[bsize] / 2
const int bs = num_8x8_blocks_wide_lookup[bsize];
@@ -580,8 +633,47 @@
// bits of smaller block sizes to be zero.
memset(above_ctx, partition_context_lookup[subsize].above, bs);
memset(left_ctx, partition_context_lookup[subsize].left, bs);
+#endif // CONFIG_EXT_PARTITION_TYPES
}
+#if CONFIG_EXT_PARTITION_TYPES
+static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row,
+ int mi_col, BLOCK_SIZE subsize,
+ BLOCK_SIZE bsize,
+ PARTITION_TYPE partition) {
+ if (bsize >= BLOCK_8X8) {
+ const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+ BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+ switch (partition) {
+ case PARTITION_SPLIT:
+ if (bsize != BLOCK_8X8) break;
+ case PARTITION_NONE:
+ case PARTITION_HORZ:
+ case PARTITION_VERT:
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+ break;
+ case PARTITION_HORZ_A:
+ update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+ update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize);
+ break;
+ case PARTITION_HORZ_B:
+ update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+ update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize);
+ break;
+ case PARTITION_VERT_A:
+ update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+ update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize);
+ break;
+ case PARTITION_VERT_B:
+ update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+ update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize);
+ break;
+ default: assert(0 && "Invalid partition type");
+ }
+ }
+}
+#endif // CONFIG_EXT_PARTITION_TYPES
+
static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row,
int mi_col, BLOCK_SIZE bsize) {
const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
@@ -596,6 +688,169 @@
return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
}
+static INLINE int max_block_wide(const MACROBLOCKD *xd, const BLOCK_SIZE bsize,
+ const int plane) {
+ int max_blocks_wide = block_size_wide[bsize];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ if (xd->mb_to_right_edge < 0)
+ max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+ // Scale the width in the transform block unit.
+ return max_blocks_wide >> tx_size_wide_log2[0];
+}
+
+static INLINE int max_block_high(const MACROBLOCKD *xd, const BLOCK_SIZE bsize,
+ const int plane) {
+ int max_blocks_high = block_size_high[bsize];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ if (xd->mb_to_bottom_edge < 0)
+ max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
+
+  // Scale the height in the transform block unit.
+ return max_blocks_high >> tx_size_wide_log2[0];
+}
+
+static INLINE void av1_zero_above_context(AV1_COMMON *const cm,
+ int mi_col_start, int mi_col_end) {
+ const int width = mi_col_end - mi_col_start;
+
+ const int offset_y = 2 * mi_col_start;
+ const int width_y = 2 * width;
+ const int offset_uv = offset_y >> cm->subsampling_x;
+ const int width_uv = width_y >> cm->subsampling_x;
+
+ av1_zero_array(cm->above_context[0] + offset_y, width_y);
+ av1_zero_array(cm->above_context[1] + offset_uv, width_uv);
+ av1_zero_array(cm->above_context[2] + offset_uv, width_uv);
+
+ av1_zero_array(cm->above_seg_context + mi_col_start, width);
+
+#if CONFIG_VAR_TX
+ av1_zero_array(cm->above_txfm_context + mi_col_start, width);
+#endif // CONFIG_VAR_TX
+}
+
+static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) {
+ av1_zero(xd->left_context);
+ av1_zero(xd->left_seg_context);
+#if CONFIG_VAR_TX
+ av1_zero(xd->left_txfm_context_buffer);
+#endif
+}
+
+#if CONFIG_VAR_TX
+static INLINE TX_SIZE get_min_tx_size(const TX_SIZE tx_size) {
+ if (tx_size >= TX_SIZES_ALL) assert(0);
+ return txsize_sqr_map[tx_size];
+}
+
+static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
+ int i;
+ for (i = 0; i < len; ++i) txfm_ctx[i] = txs;
+}
+
+static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n8_w, int n8_h,
+ const MACROBLOCKD *xd) {
+ uint8_t bw = tx_size_wide[tx_size];
+ uint8_t bh = tx_size_high[tx_size];
+ set_txfm_ctx(xd->above_txfm_context, bw, n8_w);
+ set_txfm_ctx(xd->left_txfm_context, bh, n8_h);
+}
+
+static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
+ TXFM_CONTEXT *left_ctx,
+ TX_SIZE tx_size) {
+ BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ int bh = num_8x8_blocks_high_lookup[bsize];
+ int bw = num_8x8_blocks_wide_lookup[bsize];
+ uint8_t txw = tx_size_wide[tx_size];
+ uint8_t txh = tx_size_high[tx_size];
+ int i;
+ for (i = 0; i < bh; ++i) left_ctx[i] = txh;
+ for (i = 0; i < bw; ++i) above_ctx[i] = txw;
+}
+
+static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx,
+ TXFM_CONTEXT *left_ctx,
+ const BLOCK_SIZE bsize,
+ const TX_SIZE tx_size) {
+ const uint8_t txw = tx_size_wide[tx_size];
+ const uint8_t txh = tx_size_high[tx_size];
+ const int above = *above_ctx < txw;
+ const int left = *left_ctx < txh;
+ TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+ int category = 15;
+
+ if (max_tx_size == TX_32X32) {
+ if (tx_size == TX_32X32)
+ category = 0;
+ else
+ category = 1;
+ } else if (max_tx_size == TX_16X16) {
+ if (tx_size == TX_16X16)
+ category = 2;
+ else
+ category = 3;
+ } else if (max_tx_size == TX_8X8) {
+ category = 4;
+ }
+
+ if (category == 15) return category;
+
+ return category * 3 + above + left;
+}
+#endif
+
+static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
+ const int mi_row, const int mi_col,
+ const BLOCK_SIZE bsize) {
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) {
+ return PARTITION_INVALID;
+ } else {
+ const int offset = mi_row * cm->mi_stride + mi_col;
+ MODE_INFO **mi = cm->mi_grid_visible + offset;
+ const MB_MODE_INFO *const mbmi = &mi[0]->mbmi;
+ const int bsl = b_width_log2_lookup[bsize];
+ const PARTITION_TYPE partition = partition_lookup[bsl][mbmi->sb_type];
+#if !CONFIG_EXT_PARTITION_TYPES
+ return partition;
+#else
+ const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+
+ assert(cm->mi_grid_visible[offset] == &cm->mi[offset]);
+
+ if (partition != PARTITION_NONE && bsize > BLOCK_8X8 &&
+ mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
+ const BLOCK_SIZE h = get_subsize(bsize, PARTITION_HORZ_A);
+ const BLOCK_SIZE v = get_subsize(bsize, PARTITION_VERT_A);
+ const MB_MODE_INFO *const mbmi_right = &mi[hbs]->mbmi;
+ const MB_MODE_INFO *const mbmi_below = &mi[hbs * cm->mi_stride]->mbmi;
+ if (mbmi->sb_type == h) {
+ return mbmi_below->sb_type == h ? PARTITION_HORZ : PARTITION_HORZ_B;
+ } else if (mbmi->sb_type == v) {
+ return mbmi_right->sb_type == v ? PARTITION_VERT : PARTITION_VERT_B;
+ } else if (mbmi_below->sb_type == h) {
+ return PARTITION_HORZ_A;
+ } else if (mbmi_right->sb_type == v) {
+ return PARTITION_VERT_A;
+ } else {
+ return PARTITION_SPLIT;
+ }
+ }
+
+ return partition;
+#endif // !CONFIG_EXT_PARTITION_TYPES
+ }
+}
+
+static INLINE void set_sb_size(AV1_COMMON *const cm, const BLOCK_SIZE sb_size) {
+ cm->sb_size = sb_size;
+ cm->mib_size = num_8x8_blocks_wide_lookup[cm->sb_size];
+ cm->mib_size_log2 = mi_width_log2_lookup[cm->sb_size];
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
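
set_sb_size() above is what lets the loop-filter and context code iterate in cm->mib_size steps rather than MAX_MIB_SIZE, and mi_cols_aligned_to_sb() now rounds to that per-frame superblock size. A small sketch of the rounding, assuming a 64x64 superblock is 8 MI units wide (mib_size_log2 == 3) and a 128x128 one is 16 (mib_size_log2 == 4); the macro definition is an assumption reproduced here so the sketch stands alone:

#include <stdio.h>

#define ALIGN_POWER_OF_TWO(value, n) \
  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

int main(void) {
  const int mi_cols = 53; /* e.g. a 424-pixel-wide frame in 8x8 MI units     */
  printf("%d\n", ALIGN_POWER_OF_TWO(mi_cols, 3)); /* 56: 64x64 superblocks   */
  printf("%d\n", ALIGN_POWER_OF_TWO(mi_cols, 4)); /* 64: 128x128 superblocks */
  return 0;
}
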
diff --git a/av1/common/pred_common.c b/av1/common/pred_common.c
index 9272f8b..5b7c2ec 100644
--- a/av1/common/pred_common.c
+++ b/av1/common/pred_common.c
@@ -1,23 +1,77 @@
+
/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
*
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
*/
+
#include "av1/common/common.h"
#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
#include "av1/common/seg_common.h"
// Returns a context number for the given MB prediction signal
+#if CONFIG_DUAL_FILTER
+static InterpFilter get_ref_filter_type(const MODE_INFO *mi,
+ const MACROBLOCKD *xd, int dir,
+ MV_REFERENCE_FRAME ref_frame) {
+ InterpFilter ref_type = SWITCHABLE_FILTERS;
+ const MB_MODE_INFO *ref_mbmi = &mi->mbmi;
+ int use_subpel[2] = {
+ has_subpel_mv_component(mi, xd, dir),
+ has_subpel_mv_component(mi, xd, dir + 2),
+ };
+
+ if (ref_mbmi->ref_frame[0] == ref_frame && use_subpel[0])
+ ref_type = ref_mbmi->interp_filter[(dir & 0x01)];
+ else if (ref_mbmi->ref_frame[1] == ref_frame && use_subpel[1])
+ ref_type = ref_mbmi->interp_filter[(dir & 0x01) + 2];
+
+ return ref_type;
+}
+
+int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int ctx_offset =
+ (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET;
+ MV_REFERENCE_FRAME ref_frame =
+ (dir < 2) ? mbmi->ref_frame[0] : mbmi->ref_frame[1];
+ // Note:
+ // The mode info data structure has a one element border above and to the
+ // left of the entries corresponding to real macroblocks.
+ // The prediction flags in these dummy entries are initialized to 0.
+ int filter_type_ctx = ctx_offset + (dir & 0x01) * INTER_FILTER_DIR_OFFSET;
+ int left_type = SWITCHABLE_FILTERS;
+ int above_type = SWITCHABLE_FILTERS;
+
+ if (xd->left_available)
+ left_type = get_ref_filter_type(xd->mi[-1], xd, dir, ref_frame);
+
+ if (xd->up_available)
+ above_type =
+ get_ref_filter_type(xd->mi[-xd->mi_stride], xd, dir, ref_frame);
+
+ if (left_type == above_type)
+ filter_type_ctx += left_type;
+ else if (left_type == SWITCHABLE_FILTERS && above_type != SWITCHABLE_FILTERS)
+ filter_type_ctx += above_type;
+ else if (left_type != SWITCHABLE_FILTERS && above_type == SWITCHABLE_FILTERS)
+ filter_type_ctx += left_type;
+ else
+ filter_type_ctx += SWITCHABLE_FILTERS;
+
+ return filter_type_ctx;
+}
+#else
int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
// Note:
// The mode info data structure has a one element border above and to the
- // left of the entries correpsonding to real macroblocks.
- // The prediction flags in these dummy entries are initialised to 0.
+ // left of the entries corresponding to real macroblocks.
+ // The prediction flags in these dummy entries are initialized to 0.
const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
const int left_type = xd->left_available && is_inter_block(left_mbmi)
? left_mbmi->interp_filter
@@ -36,6 +90,57 @@
else
return SWITCHABLE_FILTERS;
}
+#endif
+
+#if CONFIG_EXT_INTRA
+// Obtain the reference filter type from the above/left neighbor blocks.
+static INTRA_FILTER get_ref_intra_filter(const MB_MODE_INFO *ref_mbmi) {
+ INTRA_FILTER ref_type = INTRA_FILTERS;
+
+ if (ref_mbmi->sb_type >= BLOCK_8X8) {
+ PREDICTION_MODE mode = ref_mbmi->mode;
+ if (is_inter_block(ref_mbmi)) {
+#if CONFIG_DUAL_FILTER
+ switch (ref_mbmi->interp_filter[0]) {
+#else
+ switch (ref_mbmi->interp_filter) {
+#endif
+ case EIGHTTAP_REGULAR: ref_type = INTRA_FILTER_8TAP; break;
+ case EIGHTTAP_SMOOTH: ref_type = INTRA_FILTER_8TAP_SMOOTH; break;
+ case MULTITAP_SHARP: ref_type = INTRA_FILTER_8TAP_SHARP; break;
+ case BILINEAR: ref_type = INTRA_FILTERS; break;
+ default: break;
+ }
+ } else {
+ if (mode != DC_PRED && mode != TM_PRED) {
+ int p_angle =
+ mode_to_angle_map[mode] + ref_mbmi->angle_delta[0] * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle)) {
+ ref_type = ref_mbmi->intra_filter;
+ }
+ }
+ }
+ }
+ return ref_type;
+}
+
+int av1_get_pred_context_intra_interp(const MACROBLOCKD *xd) {
+ int left_type = INTRA_FILTERS, above_type = INTRA_FILTERS;
+
+ if (xd->left_available) left_type = get_ref_intra_filter(xd->left_mbmi);
+
+ if (xd->up_available) above_type = get_ref_intra_filter(xd->above_mbmi);
+
+ if (left_type == above_type)
+ return left_type;
+ else if (left_type == INTRA_FILTERS && above_type != INTRA_FILTERS)
+ return above_type;
+ else if (left_type != INTRA_FILTERS && above_type == INTRA_FILTERS)
+ return left_type;
+ else
+ return INTRA_FILTERS;
+}
+#endif // CONFIG_EXT_INTRA
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -62,12 +167,11 @@
}
#if CONFIG_EXT_REFS
-#define CHECK_BWDREF_OR_ALTREF(ref_frame) \
- (((ref_frame) == BWDREF_FRAME) || ((ref_frame) == ALTREF_FRAME))
-
-#define IS_BWD_PRED_REF_FRAME(ref_frame) CHECK_BWDREF_OR_ALTREF(ref_frame)
+#define CHECK_BACKWARD_REFS(ref_frame) \
+ (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
+#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
#else
-#define IS_BWD_PRED_REF_FRAME(ref_frame) ((ref_frame) == cm->comp_fixed_ref)
+#define IS_BACKWARD_REF_FRAME(ref_frame) ((ref_frame) == cm->comp_fixed_ref)
#endif // CONFIG_EXT_REFS
int av1_get_reference_mode_context(const AV1_COMMON *cm,
@@ -89,15 +193,15 @@
if (has_above && has_left) { // both edges available
if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi))
// neither edge uses comp pred (0/1)
- ctx = IS_BWD_PRED_REF_FRAME(above_mbmi->ref_frame[0]) ^
- IS_BWD_PRED_REF_FRAME(left_mbmi->ref_frame[0]);
+ ctx = IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ^
+ IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]);
else if (!has_second_ref(above_mbmi))
// one of two edges uses comp pred (2/3)
- ctx = 2 + (IS_BWD_PRED_REF_FRAME(above_mbmi->ref_frame[0]) ||
+ ctx = 2 + (IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ||
!is_inter_block(above_mbmi));
else if (!has_second_ref(left_mbmi))
// one of two edges uses comp pred (2/3)
- ctx = 2 + (IS_BWD_PRED_REF_FRAME(left_mbmi->ref_frame[0]) ||
+ ctx = 2 + (IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]) ||
!is_inter_block(left_mbmi));
else // both edges use comp pred (4)
ctx = 4;
@@ -106,7 +210,7 @@
if (!has_second_ref(edge_mbmi))
// edge does not use comp pred (0/1)
- ctx = IS_BWD_PRED_REF_FRAME(edge_mbmi->ref_frame[0]);
+ ctx = IS_BACKWARD_REF_FRAME(edge_mbmi->ref_frame[0]);
else
// edge uses comp pred (3)
ctx = 3;
@@ -129,12 +233,13 @@
((ref_frame == GOLDEN_FRAME) || (ref_frame == LAST3_FRAME))
// Returns a context number for the given MB prediction signal
-// Signal the first reference frame for a compound mode be either GOLDEN/LAST3,
-// or LAST/LAST2.
+// Signal that the first reference frame for a compound mode is either
+// GOLDEN/LAST3 or LAST/LAST2.
//
-// NOTE: The probability of ref_frame[0] is either GOLDEN_FRAME or LAST3_FRAME.
-int av1_get_pred_context_comp_fwdref_p(const AV1_COMMON *cm,
- const MACROBLOCKD *xd) {
+// NOTE(zoeliu): The probability that ref_frame[0] is either
+// GOLDEN_FRAME or LAST3_FRAME.
+int av1_get_pred_context_comp_ref_p(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
@@ -177,8 +282,8 @@
if (frfa == frfl && CHECK_GOLDEN_OR_LAST3(frfa)) {
pred_context = 0;
} else if (l_sg && a_sg) { // single/single
- if ((CHECK_BWDREF_OR_ALTREF(frfa) && CHECK_LAST_OR_LAST2(frfl)) ||
- (CHECK_BWDREF_OR_ALTREF(frfl) && CHECK_LAST_OR_LAST2(frfa))) {
+ if ((CHECK_BACKWARD_REFS(frfa) && CHECK_LAST_OR_LAST2(frfl)) ||
+ (CHECK_BACKWARD_REFS(frfl) && CHECK_LAST_OR_LAST2(frfa))) {
pred_context = 4;
} else if (CHECK_GOLDEN_OR_LAST3(frfa) || CHECK_GOLDEN_OR_LAST3(frfl)) {
pred_context = 1;
@@ -228,13 +333,13 @@
}
// Returns a context number for the given MB prediction signal
-// Signal the first reference frame for a compound mode be LAST, conditioning
-// on that it is known either LAST/LAST2.
+// Signal that the first reference frame for a compound mode is LAST,
+// given that it is known to be either LAST or LAST2.
//
-// NOTE: The probability of ref_frame[0] is LAST_FRAME, conditioning on it is
-// either LAST_FRAME or LAST2_FRAME.
-int av1_get_pred_context_comp_fwdref_p1(const AV1_COMMON *cm,
- const MACROBLOCKD *xd) {
+// NOTE(zoeliu): The probability that ref_frame[0] is LAST_FRAME,
+// given that it is either LAST_FRAME or LAST2_FRAME.
+int av1_get_pred_context_comp_ref_p1(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
@@ -280,7 +385,7 @@
else if (CHECK_GOLDEN_OR_LAST3(frfa) || CHECK_GOLDEN_OR_LAST3(frfl))
pred_context = 2 + (frfa != frfl);
else if (frfa == frfl ||
- (CHECK_BWDREF_OR_ALTREF(frfa) && CHECK_BWDREF_OR_ALTREF(frfl)))
+ (CHECK_BACKWARD_REFS(frfa) && CHECK_BACKWARD_REFS(frfl)))
pred_context = 3;
else
pred_context = 4;
@@ -329,13 +434,13 @@
}
// Returns a context number for the given MB prediction signal
-// Signal the first reference frame for a compound mode be GOLDEN, conditioning
-// on that it is known either GOLDEN or LAST3.
+// Signal that the first reference frame for a compound mode is GOLDEN,
+// given that it is known to be either GOLDEN or LAST3.
//
-// NOTE: The probability of ref_frame[0] is GOLDEN_FRAME, conditioning on it is
-// either GOLDEN or LAST3.
-int av1_get_pred_context_comp_fwdref_p2(const AV1_COMMON *cm,
- const MACROBLOCKD *xd) {
+// NOTE(zoeliu): The probability that ref_frame[0] is GOLDEN_FRAME,
+// given that it is either GOLDEN or LAST3.
+int av1_get_pred_context_comp_ref_p2(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
@@ -381,7 +486,7 @@
else if (CHECK_LAST_OR_LAST2(frfa) || CHECK_LAST_OR_LAST2(frfl))
pred_context = 2 + (frfa != frfl);
else if (frfa == frfl ||
- (CHECK_BWDREF_OR_ALTREF(frfa) && CHECK_BWDREF_OR_ALTREF(frfl)))
+ (CHECK_BACKWARD_REFS(frfa) && CHECK_BACKWARD_REFS(frfl)))
pred_context = 3;
else
pred_context = 4;
@@ -429,11 +534,6 @@
}
// Returns a context number for the given MB prediction signal
-// Signal the second reference frame for a compound mode be ALTREF, conditioning
-// on that it is known either ALTREF or BWDREF.
-//
-// NOTE: The probability of ref_frame[1] is ALTREF_FRAME, conditioning on it is
-// either ALTREF or BWDREF.
int av1_get_pred_context_comp_bwdref_p(const AV1_COMMON *cm,
const MACROBLOCKD *xd) {
int pred_context;
@@ -459,7 +559,7 @@
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
if (!has_second_ref(edge_mbmi)) // single pred (1/3)
- pred_context = 1 + 2 * (edge_mbmi->ref_frame[1] != cm->comp_bwd_ref[1]);
+ pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != cm->comp_bwd_ref[1]);
else // comp pred (1/3)
pred_context =
1 +
@@ -556,8 +656,8 @@
// Note:
// The mode info data structure has a one element border above and to the
- // left of the entries correpsonding to real macroblocks.
- // The prediction flags in these dummy entries are initialised to 0.
+ // left of the entries corresponding to real macroblocks.
+ // The prediction flags in these dummy entries are initialized to 0.
const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
const int var_ref_idx = !fix_ref_idx;
@@ -635,7 +735,7 @@
// For the bit to signal whether the single reference is a ALTREF_FRAME
// or a BWDREF_FRAME.
//
-// NOTE: The probability of ref_frame[0] is ALTREF/BWDREF.
+// NOTE(zoeliu): The probability that ref_frame[0] is ALTREF/BWDREF.
int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
@@ -656,51 +756,37 @@
} else if (above_intra || left_intra) { // intra/inter or inter/intra
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
- if (!has_second_ref(edge_mbmi))
- pred_context = 4 * (!CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]));
- else
- pred_context = 1 + (!CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]) ||
- !CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[1]));
+ if (!has_second_ref(edge_mbmi)) // single
+ pred_context = 4 * (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]));
+ else // comp
+ pred_context = 2;
} else { // inter/inter
const int above_has_second = has_second_ref(above_mbmi);
const int left_has_second = has_second_ref(left_mbmi);
const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
- const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
- const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
- if (above_has_second && left_has_second) {
- pred_context = 1 + (!CHECK_BWDREF_OR_ALTREF(above0) ||
- !CHECK_BWDREF_OR_ALTREF(above1) ||
- !CHECK_BWDREF_OR_ALTREF(left0) ||
- !CHECK_BWDREF_OR_ALTREF(left1));
- } else if (above_has_second || left_has_second) {
+ if (above_has_second && left_has_second) { // comp/comp
+ pred_context = 2;
+ } else if (above_has_second || left_has_second) { // single/comp
const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
- const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
- const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
- if (!CHECK_BWDREF_OR_ALTREF(rfs))
- pred_context = 3 + (!CHECK_BWDREF_OR_ALTREF(crf1) ||
- !CHECK_BWDREF_OR_ALTREF(crf2));
- else
- pred_context =
- !CHECK_BWDREF_OR_ALTREF(crf1) || !CHECK_BWDREF_OR_ALTREF(crf2);
- } else {
- pred_context = 2 * (!CHECK_BWDREF_OR_ALTREF(above0)) +
- 2 * (!CHECK_BWDREF_OR_ALTREF(left0));
+ pred_context = (!CHECK_BACKWARD_REFS(rfs)) ? 4 : 1;
+ } else { // single/single
+ pred_context = 2 * (!CHECK_BACKWARD_REFS(above0)) +
+ 2 * (!CHECK_BACKWARD_REFS(left0));
}
}
} else if (has_above || has_left) { // one edge available
const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
if (!is_inter_block(edge_mbmi)) { // intra
pred_context = 2;
- } else { // inter
- if (!has_second_ref(edge_mbmi))
- pred_context = 4 * (!CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]));
- else
- pred_context = 1 + (!CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]) ||
- !CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[1]));
+ } else { // inter
+ if (!has_second_ref(edge_mbmi)) // single
+ pred_context = 4 * (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]));
+ else // comp
+ pred_context = 2;
}
} else { // no edges available
pred_context = 2;
@@ -713,8 +799,8 @@
// For the bit to signal whether the single reference is ALTREF_FRAME or
// BWDREF_FRAME, knowing that it shall be either of these 2 choices.
//
-// NOTE: The probability of ref_frame[0] is ALTREF_FRAME, conditioning on it is
-// either ALTREF_FRAME or BWDREF_FRAME.
+// NOTE(zoeliu): The probability that ref_frame[0] is ALTREF_FRAME, given
+// that it is either ALTREF_FRAME or BWDREF_FRAME.
int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
@@ -734,12 +820,12 @@
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter or inter/intra
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
- if (!has_second_ref(edge_mbmi)) {
- if (!CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]))
+ if (!has_second_ref(edge_mbmi)) { // single
+ if (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]))
pred_context = 3;
else
pred_context = 4 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME);
- } else {
+ } else { // comp
pred_context = 1 +
2 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME ||
edge_mbmi->ref_frame[1] == BWDREF_FRAME);
@@ -752,14 +838,14 @@
const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
- if (above_has_second && left_has_second) {
+ if (above_has_second && left_has_second) { // comp/comp
if (above0 == left0 && above1 == left1)
pred_context =
3 * (above0 == BWDREF_FRAME || above1 == BWDREF_FRAME ||
left0 == BWDREF_FRAME || left1 == BWDREF_FRAME);
else
pred_context = 2;
- } else if (above_has_second || left_has_second) {
+ } else if (above_has_second || left_has_second) { // single/comp
const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
@@ -770,13 +856,13 @@
pred_context = (crf1 == BWDREF_FRAME || crf2 == BWDREF_FRAME);
else
pred_context = 1 + 2 * (crf1 == BWDREF_FRAME || crf2 == BWDREF_FRAME);
- } else {
- if (!CHECK_BWDREF_OR_ALTREF(above0) && !CHECK_BWDREF_OR_ALTREF(left0)) {
+ } else { // single/single
+ if (!CHECK_BACKWARD_REFS(above0) && !CHECK_BACKWARD_REFS(left0)) {
pred_context = 2 + (above0 == left0);
- } else if (!CHECK_BWDREF_OR_ALTREF(above0) ||
- !CHECK_BWDREF_OR_ALTREF(left0)) {
+ } else if (!CHECK_BACKWARD_REFS(above0) ||
+ !CHECK_BACKWARD_REFS(left0)) {
const MV_REFERENCE_FRAME edge0 =
- !CHECK_BWDREF_OR_ALTREF(above0) ? left0 : above0;
+ !CHECK_BACKWARD_REFS(above0) ? left0 : above0;
pred_context = 4 * (edge0 == BWDREF_FRAME);
} else {
pred_context =
@@ -788,12 +874,12 @@
const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
if (!is_inter_block(edge_mbmi) ||
- (!CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]) &&
+ (!CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]) &&
!has_second_ref(edge_mbmi)))
pred_context = 2;
- else if (!has_second_ref(edge_mbmi))
+ else if (!has_second_ref(edge_mbmi)) // single
pred_context = 4 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME);
- else
+ else // comp
pred_context = 3 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME ||
edge_mbmi->ref_frame[1] == BWDREF_FRAME);
} else { // no edges available (2)
@@ -807,8 +893,8 @@
// For the bit to signal whether the single reference is LAST3/GOLDEN or
// LAST2/LAST, knowing that it shall be either of these 2 choices.
//
-// NOTE: The probability of ref_frame[0] is LAST3/GOLDEN, conditioning on it is
-// either LAST3/GOLDEN/LAST2/LAST.
+// NOTE(zoeliu): The probability that ref_frame[0] is LAST3/GOLDEN, given
+// that it is one of LAST3/GOLDEN/LAST2/LAST.
int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
@@ -828,12 +914,12 @@
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter or inter/intra
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
- if (!has_second_ref(edge_mbmi)) {
- if (CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]))
+ if (!has_second_ref(edge_mbmi)) { // single
+ if (CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]))
pred_context = 3;
else
pred_context = 4 * CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]);
- } else {
+ } else { // comp
pred_context = 1 +
2 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) ||
CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1]));
@@ -846,14 +932,14 @@
const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
- if (above_has_second && left_has_second) {
+ if (above_has_second && left_has_second) { // comp/comp
if (above0 == left0 && above1 == left1)
pred_context =
3 * (CHECK_LAST_OR_LAST2(above0) || CHECK_LAST_OR_LAST2(above1) ||
CHECK_LAST_OR_LAST2(left0) || CHECK_LAST_OR_LAST2(left1));
else
pred_context = 2;
- } else if (above_has_second || left_has_second) {
+ } else if (above_has_second || left_has_second) { // single/comp
const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
@@ -867,13 +953,12 @@
else
pred_context =
1 + 2 * (CHECK_LAST_OR_LAST2(crf1) || CHECK_LAST_OR_LAST2(crf2));
- } else {
- if (CHECK_BWDREF_OR_ALTREF(above0) && CHECK_BWDREF_OR_ALTREF(left0)) {
+ } else { // single/single
+ if (CHECK_BACKWARD_REFS(above0) && CHECK_BACKWARD_REFS(left0)) {
pred_context = 2 + (above0 == left0);
- } else if (CHECK_BWDREF_OR_ALTREF(above0) ||
- CHECK_BWDREF_OR_ALTREF(left0)) {
+ } else if (CHECK_BACKWARD_REFS(above0) || CHECK_BACKWARD_REFS(left0)) {
const MV_REFERENCE_FRAME edge0 =
- CHECK_BWDREF_OR_ALTREF(above0) ? left0 : above0;
+ CHECK_BACKWARD_REFS(above0) ? left0 : above0;
pred_context = 4 * CHECK_LAST_OR_LAST2(edge0);
} else {
pred_context =
@@ -885,12 +970,12 @@
const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
if (!is_inter_block(edge_mbmi) ||
- (CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]) &&
+ (CHECK_BACKWARD_REFS(edge_mbmi->ref_frame[0]) &&
!has_second_ref(edge_mbmi)))
pred_context = 2;
- else if (!has_second_ref(edge_mbmi))
+ else if (!has_second_ref(edge_mbmi)) // single
pred_context = 4 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]));
- else
+ else // comp
pred_context = 3 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) ||
CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1]));
} else { // no edges available (2)
@@ -904,8 +989,8 @@
// For the bit to signal whether the single reference is LAST2_FRAME or
// LAST_FRAME, knowing that it shall be either of these 2 choices.
//
-// NOTE: The probability of ref_frame[0] is LAST2_FRAME, conditioning on it is
-// either LAST2_FRAME or LAST_FRAME.
+// NOTE(zoeliu): The probability that ref_frame[0] is LAST2_FRAME, given
+// that it is either LAST2_FRAME or LAST_FRAME.
int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
@@ -925,12 +1010,12 @@
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter or inter/intra
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
- if (!has_second_ref(edge_mbmi)) {
+ if (!has_second_ref(edge_mbmi)) { // single
if (!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]))
pred_context = 3;
else
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
- } else {
+ } else { // comp
pred_context = 1 +
2 * (edge_mbmi->ref_frame[0] == LAST_FRAME ||
edge_mbmi->ref_frame[1] == LAST_FRAME);
@@ -943,13 +1028,13 @@
const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
- if (above_has_second && left_has_second) {
+ if (above_has_second && left_has_second) { // comp/comp
if (above0 == left0 && above1 == left1)
pred_context = 3 * (above0 == LAST_FRAME || above1 == LAST_FRAME ||
left0 == LAST_FRAME || left1 == LAST_FRAME);
else
pred_context = 2;
- } else if (above_has_second || left_has_second) {
+ } else if (above_has_second || left_has_second) { // single/comp
const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
@@ -960,7 +1045,7 @@
pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
else
pred_context = 1 + 2 * (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
- } else {
+ } else { // single/single
if (!CHECK_LAST_OR_LAST2(above0) && !CHECK_LAST_OR_LAST2(left0)) {
pred_context = 2 + (above0 == left0);
} else if (!CHECK_LAST_OR_LAST2(above0) ||
@@ -980,9 +1065,9 @@
(!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) &&
!has_second_ref(edge_mbmi)))
pred_context = 2;
- else if (!has_second_ref(edge_mbmi))
+ else if (!has_second_ref(edge_mbmi)) // single
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
- else
+ else // comp
pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST_FRAME ||
edge_mbmi->ref_frame[1] == LAST_FRAME);
} else { // no edges available (2)
@@ -996,8 +1081,8 @@
// For the bit to signal whether the single reference is GOLDEN_FRAME or
// LAST3_FRAME, knowing that it shall be either of these 2 choices.
//
-// NOTE: The probability of ref_frame[0] is GOLDEN_FRAME, conditioning on it is
-// either GOLDEN_FRAME or LAST3_FRAME.
+// NOTE(zoeliu): The probability that ref_frame[0] is GOLDEN_FRAME, given
+// that it is either GOLDEN_FRAME or LAST3_FRAME.
int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd) {
int pred_context;
const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
@@ -1017,12 +1102,12 @@
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter or inter/intra
const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
- if (!has_second_ref(edge_mbmi)) {
+ if (!has_second_ref(edge_mbmi)) { // single
if (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]))
pred_context = 3;
else
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME);
- } else {
+ } else { // comp
pred_context = 1 +
2 * (edge_mbmi->ref_frame[0] == LAST3_FRAME ||
edge_mbmi->ref_frame[1] == LAST3_FRAME);
@@ -1035,13 +1120,13 @@
const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
- if (above_has_second && left_has_second) {
+ if (above_has_second && left_has_second) { // comp/comp
if (above0 == left0 && above1 == left1)
pred_context = 3 * (above0 == LAST3_FRAME || above1 == LAST3_FRAME ||
left0 == LAST3_FRAME || left1 == LAST3_FRAME);
else
pred_context = 2;
- } else if (above_has_second || left_has_second) {
+ } else if (above_has_second || left_has_second) { // single/comp
const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
@@ -1052,7 +1137,7 @@
pred_context = (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
else
pred_context = 1 + 2 * (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
- } else {
+ } else { // single/single
if (!CHECK_GOLDEN_OR_LAST3(above0) && !CHECK_GOLDEN_OR_LAST3(left0)) {
pred_context = 2 + (above0 == left0);
} else if (!CHECK_GOLDEN_OR_LAST3(above0) ||
@@ -1073,9 +1158,9 @@
(!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]) &&
!has_second_ref(edge_mbmi)))
pred_context = 2;
- else if (!has_second_ref(edge_mbmi))
+ else if (!has_second_ref(edge_mbmi)) // single
pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME);
- else
+ else // comp
pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST3_FRAME ||
edge_mbmi->ref_frame[1] == LAST3_FRAME);
} else { // no edges available (2)
@@ -1096,8 +1181,8 @@
const int has_left = xd->left_available;
// Note:
// The mode info data structure has a one element border above and to the
- // left of the entries correpsonding to real macroblocks.
- // The prediction flags in these dummy entries are initialised to 0.
+ // left of the entries corresponding to real macroblocks.
+ // The prediction flags in these dummy entries are initialized to 0.
if (has_above && has_left) { // both edges available
const int above_intra = !is_inter_block(above_mbmi);
const int left_intra = !is_inter_block(left_mbmi);
@@ -1163,8 +1248,8 @@
// Note:
// The mode info data structure has a one element border above and to the
- // left of the entries correpsonding to real macroblocks.
- // The prediction flags in these dummy entries are initialised to 0.
+ // left of the entries corresponding to real macroblocks.
+ // The prediction flags in these dummy entries are initialized to 0.
if (has_above && has_left) { // both edges available
const int above_intra = !is_inter_block(above_mbmi);
const int left_intra = !is_inter_block(left_mbmi);
@@ -1205,7 +1290,7 @@
if (rfs == GOLDEN_FRAME)
pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
- else if (rfs == ALTREF_FRAME)
+ else if (rfs != GOLDEN_FRAME && rfs != LAST_FRAME)
pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
else
pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
diff --git a/av1/common/pred_common.h b/av1/common/pred_common.h
index 73d691b..6b47ed2 100644
--- a/av1/common/pred_common.h
+++ b/av1/common/pred_common.h
@@ -67,7 +67,15 @@
return cm->fc->skip_probs[av1_get_skip_context(xd)];
}
+#if CONFIG_DUAL_FILTER
+int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir);
+#else
int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
+#endif
+
+#if CONFIG_EXT_INTRA
+int av1_get_pred_context_intra_interp(const MACROBLOCKD *xd);
+#endif // CONFIG_EXT_INTRA
int av1_get_intra_inter_context(const MACROBLOCKD *xd);
@@ -83,32 +91,32 @@
return cm->fc->comp_inter_prob[av1_get_reference_mode_context(cm, xd)];
}
+int av1_get_pred_context_comp_ref_p(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd);
+
+static INLINE aom_prob av1_get_pred_prob_comp_ref_p(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_comp_ref_p(cm, xd);
+ return cm->fc->comp_ref_prob[pred_context][0];
+}
+
#if CONFIG_EXT_REFS
-int av1_get_pred_context_comp_fwdref_p(const AV1_COMMON *cm,
- const MACROBLOCKD *xd);
+int av1_get_pred_context_comp_ref_p1(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd);
-static INLINE aom_prob av1_get_pred_prob_comp_fwdref_p(const AV1_COMMON *cm,
- const MACROBLOCKD *xd) {
- const int pred_context = av1_get_pred_context_comp_fwdref_p(cm, xd);
- return cm->fc->comp_fwdref_prob[pred_context][0];
+static INLINE aom_prob av1_get_pred_prob_comp_ref_p1(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_comp_ref_p1(cm, xd);
+ return cm->fc->comp_ref_prob[pred_context][1];
}
-int av1_get_pred_context_comp_fwdref_p1(const AV1_COMMON *cm,
- const MACROBLOCKD *xd);
+int av1_get_pred_context_comp_ref_p2(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd);
-static INLINE aom_prob av1_get_pred_prob_comp_fwdref_p1(const AV1_COMMON *cm,
- const MACROBLOCKD *xd) {
- const int pred_context = av1_get_pred_context_comp_fwdref_p1(cm, xd);
- return cm->fc->comp_fwdref_prob[pred_context][1];
-}
-
-int av1_get_pred_context_comp_fwdref_p2(const AV1_COMMON *cm,
- const MACROBLOCKD *xd);
-
-static INLINE aom_prob av1_get_pred_prob_comp_fwdref_p2(const AV1_COMMON *cm,
- const MACROBLOCKD *xd) {
- const int pred_context = av1_get_pred_context_comp_fwdref_p2(cm, xd);
- return cm->fc->comp_fwdref_prob[pred_context][2];
+static INLINE aom_prob av1_get_pred_prob_comp_ref_p2(const AV1_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ const int pred_context = av1_get_pred_context_comp_ref_p2(cm, xd);
+ return cm->fc->comp_ref_prob[pred_context][2];
}
int av1_get_pred_context_comp_bwdref_p(const AV1_COMMON *cm,
@@ -119,17 +127,6 @@
const int pred_context = av1_get_pred_context_comp_bwdref_p(cm, xd);
return cm->fc->comp_bwdref_prob[pred_context][0];
}
-
-#else
-
-int av1_get_pred_context_comp_ref_p(const AV1_COMMON *cm,
- const MACROBLOCKD *xd);
-
-static INLINE aom_prob av1_get_pred_prob_comp_ref_p(const AV1_COMMON *cm,
- const MACROBLOCKD *xd) {
- const int pred_context = av1_get_pred_context_comp_ref_p(cm, xd);
- return cm->fc->comp_ref_prob[pred_context];
-}
#endif // CONFIG_EXT_REFS
int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
@@ -179,14 +176,16 @@
const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
const int has_above = xd->up_available;
const int has_left = xd->left_available;
- int above_ctx =
- (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size : max_tx_size;
- int left_ctx =
- (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size : max_tx_size;
+ int above_ctx = (has_above && !above_mbmi->skip)
+ ? (int)txsize_sqr_map[above_mbmi->tx_size]
+ : max_tx_size;
+ int left_ctx = (has_left && !left_mbmi->skip)
+ ? (int)txsize_sqr_map[left_mbmi->tx_size]
+ : max_tx_size;
+ assert(xd->mi[0]->mbmi.sb_type >= BLOCK_8X8);
if (!has_left) left_ctx = above_ctx;
if (!has_above) above_ctx = left_ctx;
-
#if CONFIG_CB4X4
// TODO(jingning): Temporary setup. Will rework this after the cb4x4
// framework is up running.
@@ -196,31 +195,59 @@
#endif
}
-static INLINE const aom_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
- const struct tx_probs *tx_probs) {
- switch (max_tx_size) {
- case TX_8X8: return tx_probs->p8x8[ctx];
- case TX_16X16: return tx_probs->p16x16[ctx];
- case TX_32X32: return tx_probs->p32x32[ctx];
- default: assert(0 && "Invalid max_tx_size."); return NULL;
+#if CONFIG_VAR_TX
+static void update_tx_counts(AV1_COMMON *cm, MACROBLOCKD *xd,
+ MB_MODE_INFO *mbmi, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, int blk_row, int blk_col,
+ TX_SIZE max_tx_size, int ctx) {
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ const int tx_row = blk_row >> (1 - pd->subsampling_y);
+ const int tx_col = blk_col >> (1 - pd->subsampling_x);
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
+ const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (tx_size == plane_tx_size) {
+ ++xd->counts->tx_size[max_tx_size - TX_8X8][ctx][tx_size];
+ mbmi->tx_size = tx_size;
+ } else {
+ int bsl = b_width_log2_lookup[bsize];
+ int i;
+
+ assert(bsl > 0);
+ --bsl;
+
+ for (i = 0; i < 4; ++i) {
+ const int offsetr = blk_row + ((i >> 1) << bsl);
+ const int offsetc = blk_col + ((i & 0x01) << bsl);
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+ update_tx_counts(cm, xd, mbmi, plane_bsize, (TX_SIZE)(tx_size - 1),
+ offsetr, offsetc, max_tx_size, ctx);
+ }
}
}
-static INLINE const aom_prob *get_tx_probs2(TX_SIZE max_tx_size,
- const MACROBLOCKD *xd,
- const struct tx_probs *tx_probs) {
- return get_tx_probs(max_tx_size, get_tx_size_context(xd), tx_probs);
-}
+static INLINE void inter_block_tx_count_update(AV1_COMMON *cm, MACROBLOCKD *xd,
+ MB_MODE_INFO *mbmi,
+ BLOCK_SIZE plane_bsize,
+ int ctx) {
+ const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+ TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+ BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+ int bh = num_4x4_blocks_wide_lookup[txb_size];
+ int idx, idy;
-static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
- struct tx_counts *tx_counts) {
- switch (max_tx_size) {
- case TX_8X8: return tx_counts->p8x8[ctx];
- case TX_16X16: return tx_counts->p16x16[ctx];
- case TX_32X32: return tx_counts->p32x32[ctx];
- default: assert(0 && "Invalid max_tx_size."); return NULL;
- }
+ for (idy = 0; idy < mi_height; idy += bh)
+ for (idx = 0; idx < mi_width; idx += bh)
+ update_tx_counts(cm, xd, mbmi, plane_bsize, max_tx_size, idy, idx,
+ max_tx_size, ctx);
}
+#endif
#ifdef __cplusplus
} // extern "C"
diff --git a/av1/common/quant_common.c b/av1/common/quant_common.c
index 789ac78..69d0cc0 100644
--- a/av1/common/quant_common.c
+++ b/av1/common/quant_common.c
@@ -11,13 +11,114 @@
#include "av1/common/common.h"
#include "av1/common/onyxc_int.h"
+#include "av1/common/entropy.h"
#include "av1/common/quant_common.h"
#include "av1/common/seg_common.h"
+#include "av1/common/blockd.h"
-#if CONFIG_AOM_QM
-static void make_qmatrices(qm_val_t *wmatrix[NUM_QM_LEVELS][2][2][TX_SIZES],
- qm_val_t *iwmatrix[NUM_QM_LEVELS][2][2][TX_SIZES]);
-#endif
+#if CONFIG_NEW_QUANT
+// Bin widths are expressed as a fraction over 128 of the quant stepsize,
+// for quantization bins 0-4.
+// A value x means the bin is actually a factor x/128 of the nominal
+// quantization step. For the zero bin, the width given is for one side of
+// zero only, so the actual width is twice that.
+//
+// Functions with "nuq" in their names implement non-uniform quantization.
+// TODO(sarahparker, debargha): Optimize these tables
+
+typedef struct {
+ uint8_t knots[NUQ_KNOTS]; // bin widths, in 1/128ths of the quant step
+ uint8_t doff; // dequantization offset
+} qprofile_type;
+
+static const qprofile_type nuq[QUANT_PROFILES][COEF_BANDS] = {
+ {
+ // lossless
+ { { 64, 128, 128 }, 0 }, // dc, band 0
+ { { 64, 128, 128 }, 0 }, // band 1
+ { { 64, 128, 128 }, 0 }, // band 2
+ { { 64, 128, 128 }, 0 }, // band 3
+ { { 64, 128, 128 }, 0 }, // band 4
+ { { 64, 128, 128 }, 0 }, // band 5
+ },
+ {
+ { { 64, 128, 128 }, 4 }, // dc, band 0
+ { { 64, 128, 128 }, 6 }, // band 1
+ { { 64, 128, 128 }, 8 }, // band 2
+ { { 64, 128, 128 }, 10 }, // band 3
+ { { 72, 128, 128 }, 12 }, // band 4
+ { { 80, 128, 128 }, 14 } // band 5
+ },
+ {
+ { { 64, 128, 128 }, 6 }, // dc, band 0
+ { { 64, 128, 128 }, 8 }, // band 1
+ { { 64, 128, 128 }, 10 }, // band 2
+ { { 64, 128, 128 }, 12 }, // band 3
+ { { 72, 128, 128 }, 14 }, // band 4
+ { { 80, 128, 128 }, 16 } // band 5
+ },
+ {
+ { { 64, 128, 128 }, 8 }, // dc, band 0
+ { { 64, 128, 128 }, 10 }, // band 1
+ { { 64, 128, 128 }, 12 }, // band 2
+ { { 72, 128, 128 }, 14 }, // band 3
+ { { 76, 128, 128 }, 16 }, // band 4
+ { { 80, 128, 128 }, 18 } // band 5
+ }
+};
+
+static const uint8_t *get_nuq_knots(int band, int q_profile) {
+ return nuq[q_profile][band].knots;
+}
+
+static INLINE int16_t quant_to_doff_fixed(int band, int q_profile) {
+ return nuq[q_profile][band].doff;
+}
+
+// get cumulative bins
+static INLINE void get_cuml_bins_nuq(int q, int band, tran_low_t *cuml_bins,
+ int q_profile) {
+ const uint8_t *knots = get_nuq_knots(band, q_profile);
+ int16_t cuml_knots[NUQ_KNOTS];
+ int i;
+ cuml_knots[0] = knots[0];
+ for (i = 1; i < NUQ_KNOTS; ++i) cuml_knots[i] = cuml_knots[i - 1] + knots[i];
+ for (i = 0; i < NUQ_KNOTS; ++i)
+ cuml_bins[i] = ROUND_POWER_OF_TWO(cuml_knots[i] * q, 7);
+}
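
As an editorial aside (not part of the patch), a small worked example of the computation above: with a hypothetical quant step q = 64 and the band-0 knots { 64, 128, 128 } of the first non-lossless profile, the cumulative knots are { 64, 192, 320 }, so the rounded cumulative bin boundaries land at roughly 0.5q, 1.5q and 2.5q.

    /* Editorial sketch: what get_cuml_bins_nuq() produces for q = 64 and
     * knots { 64, 128, 128 } (bin widths in 1/128ths of the quant step). */
    static void example_cuml_bins(void) {
      const int q = 64;
      const int knots[3] = { 64, 128, 128 };
      int cuml = 0, bins[3], i;
      for (i = 0; i < 3; ++i) {
        cuml += knots[i];
        bins[i] = (cuml * q + 64) >> 7; /* ROUND_POWER_OF_TWO(cuml * q, 7) */
      }
      /* bins == { 32, 96, 160 }: boundaries at ~0.5q, 1.5q and 2.5q. */
      (void)bins;
    }
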
+
+void av1_get_dequant_val_nuq(int q, int band, tran_low_t *dq,
+ tran_low_t *cuml_bins, int q_profile) {
+ const uint8_t *knots = get_nuq_knots(band, q_profile);
+ tran_low_t cuml_bins_[NUQ_KNOTS], *cuml_bins_ptr;
+ tran_low_t doff;
+ int i;
+ cuml_bins_ptr = (cuml_bins ? cuml_bins : cuml_bins_);
+ get_cuml_bins_nuq(q, band, cuml_bins_ptr, q_profile);
+ dq[0] = 0;
+ for (i = 1; i < NUQ_KNOTS; ++i) {
+ doff = quant_to_doff_fixed(band, q_profile);
+ doff = ROUND_POWER_OF_TWO(doff * knots[i], 7);
+ dq[i] =
+ cuml_bins_ptr[i - 1] + ROUND_POWER_OF_TWO((knots[i] - doff * 2) * q, 8);
+ }
+ doff = quant_to_doff_fixed(band, q_profile);
+ dq[NUQ_KNOTS] =
+ cuml_bins_ptr[NUQ_KNOTS - 1] + ROUND_POWER_OF_TWO((64 - doff) * q, 7);
+}
+
+tran_low_t av1_dequant_abscoeff_nuq(int v, int q, const tran_low_t *dq) {
+ if (v <= NUQ_KNOTS)
+ return dq[v];
+ else
+ return dq[NUQ_KNOTS] + (v - NUQ_KNOTS) * q;
+}
+
+tran_low_t av1_dequant_coeff_nuq(int v, int q, const tran_low_t *dq) {
+ tran_low_t dqmag = av1_dequant_abscoeff_nuq(abs(v), q, dq);
+ return (v < 0 ? -dqmag : dqmag);
+}
+#endif // CONFIG_NEW_QUANT
static const int16_t dc_qlookup[QINDEX_RANGE] = {
4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18,
diff --git a/av1/common/quant_common.h b/av1/common/quant_common.h
index e0238a9..43833c6 100644
--- a/av1/common/quant_common.h
+++ b/av1/common/quant_common.h
@@ -15,6 +15,7 @@
#include "aom/aom_codec.h"
#include "av1/common/seg_common.h"
#include "av1/common/enums.h"
+#include "av1/common/entropy.h"
#ifdef __cplusplus
extern "C" {
@@ -45,7 +46,7 @@
#if CONFIG_AOM_QM
// Reduce the large number of quantizers to a smaller number of levels for which
// different matrices may be defined
-static inline int aom_get_qmlevel(int qindex, int first, int last) {
+static INLINE int aom_get_qmlevel(int qindex, int first, int last) {
int qmlevel = (qindex * (last + 1 - first) + QINDEX_RANGE / 2) / QINDEX_RANGE;
qmlevel = AOMMIN(qmlevel + first, NUM_QM_LEVELS - 1);
return qmlevel;
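
As a quick editorial sanity check (not part of the patch), and assuming QINDEX_RANGE is 256 as elsewhere in the library, a mid-range qindex maps to the middle of the [first, last] interval before the clamp:

    /* Editorial sketch of the aom_get_qmlevel() arithmetic for qindex = 128,
     * first = 0, last = 15, with QINDEX_RANGE assumed to be 256. */
    static int example_qmlevel(void) {
      const int qindex = 128, first = 0, last = 15, qindex_range = 256;
      const int qmlevel =
          (qindex * (last + 1 - first) + qindex_range / 2) / qindex_range;
      return qmlevel; /* == 8, before the AOMMIN clamp to NUM_QM_LEVELS - 1 */
    }
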
@@ -57,6 +58,46 @@
int log2sizem2, int is_intra);
#endif
+#if CONFIG_NEW_QUANT
+
+#define QUANT_PROFILES 4
+#define QUANT_RANGES 2
+#define NUQ_KNOTS 3
+
+typedef tran_low_t dequant_val_type_nuq[NUQ_KNOTS + 1];
+typedef tran_low_t cuml_bins_type_nuq[NUQ_KNOTS];
+void av1_get_dequant_val_nuq(int q, int band, tran_low_t *dq,
+ tran_low_t *cuml_bins, int dq_off_index);
+tran_low_t av1_dequant_abscoeff_nuq(int v, int q, const tran_low_t *dq);
+tran_low_t av1_dequant_coeff_nuq(int v, int q, const tran_low_t *dq);
+
+static INLINE int qindex_to_qrange(int qindex) {
+ return (qindex < 140 ? 1 : 0);
+}
+
+static INLINE int get_dq_profile_from_ctx(int qindex, int q_ctx, int is_inter,
+ PLANE_TYPE plane_type) {
+ // intra/inter, Y/UV, ctx, qrange
+ static const int
+ def_dq_profile_lookup[REF_TYPES][PLANE_TYPES][COEFF_CONTEXTS0]
+ [QUANT_RANGES] = {
+ {
+ // intra
+ { { 2, 1 }, { 2, 1 }, { 2, 1 } }, // Y
+ { { 3, 1 }, { 3, 1 }, { 3, 1 } }, // UV
+ },
+ {
+ // inter
+ { { 3, 1 }, { 2, 1 }, { 2, 1 } }, // Y
+ { { 3, 1 }, { 3, 1 }, { 3, 1 } }, // UV
+ },
+ };
+ if (!qindex) return 0; // lossless
+ return def_dq_profile_lookup[is_inter][plane_type][q_ctx]
+ [qindex_to_qrange(qindex)];
+}
+#endif // CONFIG_NEW_QUANT
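
A rough usage sketch (editorial, not part of the patch): a caller would first map the block's context to a dequantization profile and then derive the per-band dequant values from it; the helper name and the q, band and q_ctx parameters below are placeholders rather than actual call sites.

    /* Hypothetical caller; q, band and q_ctx are illustrative only. */
    static void example_setup_nuq(int qindex, int q, int band, int q_ctx,
                                  int is_inter, PLANE_TYPE plane_type,
                                  dequant_val_type_nuq dq,
                                  cuml_bins_type_nuq cuml_bins) {
      const int profile =
          get_dq_profile_from_ctx(qindex, q_ctx, is_inter, plane_type);
      av1_get_dequant_val_nuq(q, band, dq, cuml_bins, profile);
    }
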
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/common/reconinter.c b/av1/common/reconinter.c
index 2c67dc2..3837809 100644
--- a/av1/common/reconinter.c
+++ b/av1/common/reconinter.c
@@ -12,21 +12,489 @@
#include <assert.h>
#include "./aom_scale_rtcd.h"
-#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
+#include "./aom_config.h"
#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
#include "av1/common/blockd.h"
#include "av1/common/reconinter.h"
#include "av1/common/reconintra.h"
+#if CONFIG_MOTION_VAR
+#include "av1/common/onyxc_int.h"
+#endif // CONFIG_MOTION_VAR
+#if CONFIG_GLOBAL_MOTION
+#include "av1/common/warped_motion.h"
+#endif // CONFIG_GLOBAL_MOTION
+
+#if CONFIG_EXT_INTER
+
+#define NSMOOTHERS 1
+static int get_masked_weight(int m, int smoothness) {
+#define SMOOTHER_LEN 32
+ static const uint8_t smoothfn[NSMOOTHERS][2 * SMOOTHER_LEN + 1] = { {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 4, 7, 13, 21, 32, 43,
+ 51, 57, 60, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ } };
+ if (m < -SMOOTHER_LEN)
+ return 0;
+ else if (m > SMOOTHER_LEN)
+ return (1 << WEDGE_WEIGHT_BITS);
+ else
+ return smoothfn[smoothness][m + SMOOTHER_LEN];
+}
+
+// [smoother][negative][direction]
+DECLARE_ALIGNED(16, static uint8_t,
+ wedge_mask_obl[NSMOOTHERS][2][WEDGE_DIRECTIONS]
+ [MASK_MASTER_SIZE * MASK_MASTER_SIZE]);
+
+DECLARE_ALIGNED(16, static uint8_t,
+ wedge_signflip_lookup[BLOCK_SIZES][MAX_WEDGE_TYPES]);
+
+// 3 * MAX_WEDGE_SQUARE is an easy-to-compute and fairly tight upper bound
+// on the sum of all mask sizes up to and including MAX_WEDGE_SQUARE.
+DECLARE_ALIGNED(16, static uint8_t,
+ wedge_mask_buf[2 * MAX_WEDGE_TYPES * 3 * MAX_WEDGE_SQUARE]);
+
+static wedge_masks_type wedge_masks[BLOCK_SIZES][2];
+
+// Some unused wedge codebooks are left temporarily to facilitate experiments.
+// To be removed when settled.
+static wedge_code_type wedge_codebook_8_hgtw[8] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+};
+
+static wedge_code_type wedge_codebook_8_hltw[8] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static wedge_code_type wedge_codebook_8_heqw[8] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
+};
+
+#if !USE_LARGE_WEDGE_CODEBOOK
+static const wedge_code_type wedge_codebook_16_hgtw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_hltw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_heqw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+const wedge_params_type wedge_params_lookup[BLOCK_SIZES] = {
+ { 0, NULL, NULL, 0, NULL },
+ { 0, NULL, NULL, 0, NULL },
+ { 0, NULL, NULL, 0, NULL },
+ { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[3], 0, wedge_masks[3] },
+ { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[4], 0, wedge_masks[4] },
+ { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[5], 0, wedge_masks[5] },
+ { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[6], 0, wedge_masks[6] },
+ { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[7], 0, wedge_masks[7] },
+ { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[8], 0, wedge_masks[8] },
+ { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[9], 0, wedge_masks[9] },
+ { 0, wedge_codebook_8_hgtw, wedge_signflip_lookup[10], 0, wedge_masks[10] },
+ { 0, wedge_codebook_8_hltw, wedge_signflip_lookup[11], 0, wedge_masks[11] },
+ { 0, wedge_codebook_8_heqw, wedge_signflip_lookup[12], 0, wedge_masks[12] },
+#if CONFIG_EXT_PARTITION
+ { 0, NULL, NULL, 0, NULL },
+ { 0, NULL, NULL, 0, NULL },
+ { 0, NULL, NULL, 0, NULL },
+#endif // CONFIG_EXT_PARTITION
+};
+
+#else
+
+static const wedge_code_type wedge_codebook_32_hgtw[32] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 1 }, { WEDGE_OBLIQUE27, 4, 2 },
+ { WEDGE_OBLIQUE27, 4, 3 }, { WEDGE_OBLIQUE27, 4, 5 },
+ { WEDGE_OBLIQUE27, 4, 6 }, { WEDGE_OBLIQUE27, 4, 7 },
+ { WEDGE_OBLIQUE153, 4, 1 }, { WEDGE_OBLIQUE153, 4, 2 },
+ { WEDGE_OBLIQUE153, 4, 3 }, { WEDGE_OBLIQUE153, 4, 5 },
+ { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE153, 4, 7 },
+ { WEDGE_OBLIQUE63, 1, 4 }, { WEDGE_OBLIQUE63, 2, 4 },
+ { WEDGE_OBLIQUE63, 3, 4 }, { WEDGE_OBLIQUE63, 5, 4 },
+ { WEDGE_OBLIQUE63, 6, 4 }, { WEDGE_OBLIQUE63, 7, 4 },
+ { WEDGE_OBLIQUE117, 1, 4 }, { WEDGE_OBLIQUE117, 2, 4 },
+ { WEDGE_OBLIQUE117, 3, 4 }, { WEDGE_OBLIQUE117, 5, 4 },
+ { WEDGE_OBLIQUE117, 6, 4 }, { WEDGE_OBLIQUE117, 7, 4 },
+};
+
+static const wedge_code_type wedge_codebook_32_hltw[32] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 1 }, { WEDGE_OBLIQUE27, 4, 2 },
+ { WEDGE_OBLIQUE27, 4, 3 }, { WEDGE_OBLIQUE27, 4, 5 },
+ { WEDGE_OBLIQUE27, 4, 6 }, { WEDGE_OBLIQUE27, 4, 7 },
+ { WEDGE_OBLIQUE153, 4, 1 }, { WEDGE_OBLIQUE153, 4, 2 },
+ { WEDGE_OBLIQUE153, 4, 3 }, { WEDGE_OBLIQUE153, 4, 5 },
+ { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE153, 4, 7 },
+ { WEDGE_OBLIQUE63, 1, 4 }, { WEDGE_OBLIQUE63, 2, 4 },
+ { WEDGE_OBLIQUE63, 3, 4 }, { WEDGE_OBLIQUE63, 5, 4 },
+ { WEDGE_OBLIQUE63, 6, 4 }, { WEDGE_OBLIQUE63, 7, 4 },
+ { WEDGE_OBLIQUE117, 1, 4 }, { WEDGE_OBLIQUE117, 2, 4 },
+ { WEDGE_OBLIQUE117, 3, 4 }, { WEDGE_OBLIQUE117, 5, 4 },
+ { WEDGE_OBLIQUE117, 6, 4 }, { WEDGE_OBLIQUE117, 7, 4 },
+};
+
+static const wedge_code_type wedge_codebook_32_heqw[32] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
+ { WEDGE_OBLIQUE27, 4, 1 }, { WEDGE_OBLIQUE27, 4, 2 },
+ { WEDGE_OBLIQUE27, 4, 3 }, { WEDGE_OBLIQUE27, 4, 5 },
+ { WEDGE_OBLIQUE27, 4, 6 }, { WEDGE_OBLIQUE27, 4, 7 },
+ { WEDGE_OBLIQUE153, 4, 1 }, { WEDGE_OBLIQUE153, 4, 2 },
+ { WEDGE_OBLIQUE153, 4, 3 }, { WEDGE_OBLIQUE153, 4, 5 },
+ { WEDGE_OBLIQUE153, 4, 6 }, { WEDGE_OBLIQUE153, 4, 7 },
+ { WEDGE_OBLIQUE63, 1, 4 }, { WEDGE_OBLIQUE63, 2, 4 },
+ { WEDGE_OBLIQUE63, 3, 4 }, { WEDGE_OBLIQUE63, 5, 4 },
+ { WEDGE_OBLIQUE63, 6, 4 }, { WEDGE_OBLIQUE63, 7, 4 },
+ { WEDGE_OBLIQUE117, 1, 4 }, { WEDGE_OBLIQUE117, 2, 4 },
+ { WEDGE_OBLIQUE117, 3, 4 }, { WEDGE_OBLIQUE117, 5, 4 },
+ { WEDGE_OBLIQUE117, 6, 4 }, { WEDGE_OBLIQUE117, 7, 4 },
+};
+
+const wedge_params_type wedge_params_lookup[BLOCK_SIZES] = {
+ { 0, NULL, NULL, 0, NULL },
+ { 0, NULL, NULL, 0, NULL },
+ { 0, NULL, NULL, 0, NULL },
+ { 5, wedge_codebook_32_heqw, wedge_signflip_lookup[3], 0, wedge_masks[3] },
+ { 5, wedge_codebook_32_hgtw, wedge_signflip_lookup[4], 0, wedge_masks[4] },
+ { 5, wedge_codebook_32_hltw, wedge_signflip_lookup[5], 0, wedge_masks[5] },
+ { 5, wedge_codebook_32_heqw, wedge_signflip_lookup[6], 0, wedge_masks[6] },
+ { 5, wedge_codebook_32_hgtw, wedge_signflip_lookup[7], 0, wedge_masks[7] },
+ { 5, wedge_codebook_32_hltw, wedge_signflip_lookup[8], 0, wedge_masks[8] },
+ { 5, wedge_codebook_32_heqw, wedge_signflip_lookup[9], 0, wedge_masks[9] },
+ { 0, wedge_codebook_8_hgtw, wedge_signflip_lookup[10], 0, wedge_masks[10] },
+ { 0, wedge_codebook_8_hltw, wedge_signflip_lookup[11], 0, wedge_masks[11] },
+ { 0, wedge_codebook_8_heqw, wedge_signflip_lookup[12], 0, wedge_masks[12] },
+#if CONFIG_EXT_PARTITION
+ { 0, NULL, NULL, 0, NULL },
+ { 0, NULL, NULL, 0, NULL },
+ { 0, NULL, NULL, 0, NULL },
+#endif // CONFIG_EXT_PARTITION
+};
+#endif // USE_LARGE_WEDGE_CODEBOOK
+
+static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg,
+ BLOCK_SIZE sb_type) {
+ const uint8_t *master;
+ const int bh = 4 << b_height_log2_lookup[sb_type];
+ const int bw = 4 << b_width_log2_lookup[sb_type];
+ const wedge_code_type *a =
+ wedge_params_lookup[sb_type].codebook + wedge_index;
+ const int smoother = wedge_params_lookup[sb_type].smoother;
+ int woff, hoff;
+ const uint8_t wsignflip = wedge_params_lookup[sb_type].signflip[wedge_index];
+
+ assert(wedge_index >= 0 &&
+ wedge_index < (1 << get_wedge_bits_lookup(sb_type)));
+ woff = (a->x_offset * bw) >> 3;
+ hoff = (a->y_offset * bh) >> 3;
+ master = wedge_mask_obl[smoother][neg ^ wsignflip][a->direction] +
+ MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) +
+ MASK_MASTER_SIZE / 2 - woff;
+ return master;
+}
+
+const uint8_t *av1_get_soft_mask(int wedge_index, int wedge_sign,
+ BLOCK_SIZE sb_type, int offset_x,
+ int offset_y) {
+ const uint8_t *mask =
+ get_wedge_mask_inplace(wedge_index, wedge_sign, sb_type);
+ if (mask) mask -= (offset_x + offset_y * MASK_MASTER_STRIDE);
+ return mask;
+}
+
+static void init_wedge_master_masks() {
+ int i, j, s;
+ const int w = MASK_MASTER_SIZE;
+ const int h = MASK_MASTER_SIZE;
+ const int stride = MASK_MASTER_STRIDE;
+ const int a[2] = { 2, 1 };
+ const double asqrt = sqrt(a[0] * a[0] + a[1] * a[1]);
+ for (s = 0; s < NSMOOTHERS; s++) {
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) {
+ int x = (2 * j + 1 - w);
+ int y = (2 * i + 1 - h);
+ int m = (int)rint((a[0] * x + a[1] * y) / asqrt);
+ wedge_mask_obl[s][1][WEDGE_OBLIQUE63][i * stride + j] =
+ wedge_mask_obl[s][1][WEDGE_OBLIQUE27][j * stride + i] =
+ get_masked_weight(m, s);
+ wedge_mask_obl[s][1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
+ wedge_mask_obl[s][1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
+ (1 << WEDGE_WEIGHT_BITS) - get_masked_weight(m, s);
+ wedge_mask_obl[s][0][WEDGE_OBLIQUE63][i * stride + j] =
+ wedge_mask_obl[s][0][WEDGE_OBLIQUE27][j * stride + i] =
+ (1 << WEDGE_WEIGHT_BITS) - get_masked_weight(m, s);
+ wedge_mask_obl[s][0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
+ wedge_mask_obl[s][0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
+ get_masked_weight(m, s);
+ wedge_mask_obl[s][1][WEDGE_VERTICAL][i * stride + j] =
+ wedge_mask_obl[s][1][WEDGE_HORIZONTAL][j * stride + i] =
+ get_masked_weight(x, s);
+ wedge_mask_obl[s][0][WEDGE_VERTICAL][i * stride + j] =
+ wedge_mask_obl[s][0][WEDGE_HORIZONTAL][j * stride + i] =
+ (1 << WEDGE_WEIGHT_BITS) - get_masked_weight(x, s);
+ }
+ }
+}
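
Editorial note (not part of the patch): with a[] = { 2, 1 }, m above is roughly the signed distance, in half-pel units, of a pixel from the line 2x + y = 0 through the block centre, and get_masked_weight() maps that distance to a 6-bit blend weight. A pixel on the line blends the two predictors equally; pixels far to either side saturate, as the sketch below illustrates (it would have to live in this file, since get_masked_weight() is static, and needs <assert.h>).

    /* Editorial sketch of the distance-to-weight mapping. */
    static void example_wedge_weights(void) {
      assert(get_masked_weight(0, 0) == 32);                /* on the line */
      assert(get_masked_weight(-SMOOTHER_LEN - 1, 0) == 0); /* far on one side */
      assert(get_masked_weight(SMOOTHER_LEN + 1, 0) ==
             (1 << WEDGE_WEIGHT_BITS));                     /* far on the other */
    }
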
+
+// If the signs for the wedges of the various block sizes are inconsistent,
+// flip the sign flag. Do this only once for every wedge codebook.
+static void init_wedge_signs() {
+ BLOCK_SIZE sb_type;
+ memset(wedge_signflip_lookup, 0, sizeof(wedge_signflip_lookup));
+ for (sb_type = BLOCK_4X4; sb_type < BLOCK_SIZES; ++sb_type) {
+ const int bw = 4 * num_4x4_blocks_wide_lookup[sb_type];
+ const int bh = 4 * num_4x4_blocks_high_lookup[sb_type];
+ const wedge_params_type wedge_params = wedge_params_lookup[sb_type];
+ const int wbits = wedge_params.bits;
+ const int wtypes = 1 << wbits;
+ int i, w;
+ if (wbits == 0) continue;
+ for (w = 0; w < wtypes; ++w) {
+ const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type);
+ int sum = 0;
+ for (i = 0; i < bw; ++i) sum += mask[i];
+ for (i = 0; i < bh; ++i) sum += mask[i * MASK_MASTER_STRIDE];
+ sum = (sum + (bw + bh) / 2) / (bw + bh);
+ wedge_params.signflip[w] = (sum < 32);
+ }
+ }
+}
+
+static void init_wedge_masks() {
+ uint8_t *dst = wedge_mask_buf;
+ BLOCK_SIZE bsize;
+ memset(wedge_masks, 0, sizeof(wedge_masks));
+ for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES; ++bsize) {
+ const uint8_t *mask;
+ const int bw = 4 * num_4x4_blocks_wide_lookup[bsize];
+ const int bh = 4 * num_4x4_blocks_high_lookup[bsize];
+ const wedge_params_type *wedge_params = &wedge_params_lookup[bsize];
+ const int wbits = wedge_params->bits;
+ const int wtypes = 1 << wbits;
+ int w;
+ if (wbits == 0) continue;
+ for (w = 0; w < wtypes; ++w) {
+ mask = get_wedge_mask_inplace(w, 0, bsize);
+ aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw,
+ bh);
+ wedge_params->masks[0][w] = dst;
+ dst += bw * bh;
+
+ mask = get_wedge_mask_inplace(w, 1, bsize);
+ aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0, bw,
+ bh);
+ wedge_params->masks[1][w] = dst;
+ dst += bw * bh;
+ }
+ assert(sizeof(wedge_mask_buf) >= (size_t)(dst - wedge_mask_buf));
+ }
+}
+
+// Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0
+void av1_init_wedge_masks() {
+ init_wedge_master_masks();
+ init_wedge_signs();
+ init_wedge_masks();
+}
+
+#if CONFIG_SUPERTX
+static void build_masked_compound_wedge_extend(
+ uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride, int wedge_index, int wedge_sign,
+ BLOCK_SIZE sb_type, int wedge_offset_x, int wedge_offset_y, int h, int w) {
+ const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
+ const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
+ const uint8_t *mask = av1_get_soft_mask(wedge_index, wedge_sign, sb_type,
+ wedge_offset_x, wedge_offset_y);
+ aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, MASK_MASTER_STRIDE, h, w, subh, subw);
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+static void build_masked_compound_wedge_extend_highbd(
+ uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
+ const uint8_t *src1_8, int src1_stride, int wedge_index, int wedge_sign,
+ BLOCK_SIZE sb_type, int wedge_offset_x, int wedge_offset_y, int h, int w,
+ int bd) {
+ const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
+ const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
+ const uint8_t *mask = av1_get_soft_mask(wedge_index, wedge_sign, sb_type,
+ wedge_offset_x, wedge_offset_y);
+ aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, MASK_MASTER_STRIDE, h, w, subh,
+ subw, bd);
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
+#endif // CONFIG_SUPERTX
+
+static void build_masked_compound_wedge(uint8_t *dst, int dst_stride,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ int wedge_index, int wedge_sign,
+ BLOCK_SIZE sb_type, int h, int w) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
+ const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
+ const uint8_t *mask =
+ av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type);
+ aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, 4 * num_4x4_blocks_wide_lookup[sb_type], h, w, subh,
+ subw);
+}
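
For reference (editorial, not part of the patch), the per-pixel operation that aom_blend_a64_mask() performs in the non-subsampled case is, to the best of my understanding, a 6-bit alpha blend with a maximum weight of 64:

    /* m is the wedge mask value in [0, 64]; p0/p1 are the two predictors. */
    static INLINE uint8_t blend_a64_pixel(uint8_t p0, uint8_t p1, int m) {
      return (uint8_t)ROUND_POWER_OF_TWO(m * p0 + (64 - m) * p1, 6);
    }
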
+
+#if CONFIG_AOM_HIGHBITDEPTH
+static void build_masked_compound_wedge_highbd(
+ uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
+ const uint8_t *src1_8, int src1_stride, int wedge_index, int wedge_sign,
+ BLOCK_SIZE sb_type, int h, int w, int bd) {
+ // Derive subsampling from h and w passed in. May be refactored to
+ // pass in subsampling factors directly.
+ const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
+ const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
+ const uint8_t *mask =
+ av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type);
+ aom_highbd_blend_a64_mask(
+ dst_8, dst_stride, src0_8, src0_stride, src1_8, src1_stride, mask,
+ 4 * num_4x4_blocks_wide_lookup[sb_type], h, w, subh, subw, bd);
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
+ uint8_t *dst, int dst_stride,
+ const int subpel_x, const int subpel_y,
+ const struct scale_factors *sf, int w,
+ int h,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ int xs, int ys,
+#if CONFIG_SUPERTX
+ int wedge_offset_x, int wedge_offset_y,
+#endif // CONFIG_SUPERTX
+ const MACROBLOCKD *xd) {
+ const MODE_INFO *mi = xd->mi[0];
+// The prediction filter types used here should be those for
+// the second reference block.
+#if CONFIG_DUAL_FILTER
+ InterpFilter tmp_ipf[4] = {
+ interp_filter[2], interp_filter[3], interp_filter[2], interp_filter[3],
+ };
+#else
+ InterpFilter tmp_ipf = interp_filter;
+#endif // CONFIG_DUAL_FILTER
+#if CONFIG_AOM_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint8_t, tmp_dst_[2 * MAX_SB_SQUARE]);
+ uint8_t *tmp_dst = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ ? CONVERT_TO_BYTEPTR(tmp_dst_)
+ : tmp_dst_;
+ av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_x,
+ subpel_y, sf, w, h, 0, tmp_ipf, xs, ys, xd);
+#if CONFIG_SUPERTX
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ build_masked_compound_wedge_extend_highbd(
+ dst, dst_stride, dst, dst_stride, tmp_dst, MAX_SB_SIZE,
+ mi->mbmi.interinter_wedge_index, mi->mbmi.interinter_wedge_sign,
+ mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w, xd->bd);
+ else
+ build_masked_compound_wedge_extend(
+ dst, dst_stride, dst, dst_stride, tmp_dst, MAX_SB_SIZE,
+ mi->mbmi.interinter_wedge_index, mi->mbmi.interinter_wedge_sign,
+ mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w);
+#else
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ build_masked_compound_wedge_highbd(
+ dst, dst_stride, dst, dst_stride, tmp_dst, MAX_SB_SIZE,
+ mi->mbmi.interinter_wedge_index, mi->mbmi.interinter_wedge_sign,
+ mi->mbmi.sb_type, h, w, xd->bd);
+ else
+ build_masked_compound_wedge(dst, dst_stride, dst, dst_stride, tmp_dst,
+ MAX_SB_SIZE, mi->mbmi.interinter_wedge_index,
+ mi->mbmi.interinter_wedge_sign,
+ mi->mbmi.sb_type, h, w);
+#endif // CONFIG_SUPERTX
+#else // CONFIG_AOM_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint8_t, tmp_dst[MAX_SB_SQUARE]);
+ av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_x,
+ subpel_y, sf, w, h, 0, tmp_ipf, xs, ys, xd);
+#if CONFIG_SUPERTX
+ build_masked_compound_wedge_extend(
+ dst, dst_stride, dst, dst_stride, tmp_dst, MAX_SB_SIZE,
+ mi->mbmi.interinter_wedge_index, mi->mbmi.interinter_wedge_sign,
+ mi->mbmi.sb_type, wedge_offset_x, wedge_offset_y, h, w);
+#else
+ build_masked_compound_wedge(dst, dst_stride, dst, dst_stride, tmp_dst,
+ MAX_SB_SIZE, mi->mbmi.interinter_wedge_index,
+ mi->mbmi.interinter_wedge_sign, mi->mbmi.sb_type,
+ h, w);
+#endif // CONFIG_SUPERTX
+#endif // CONFIG_AOM_HIGHBITDEPTH
+}
+#endif // CONFIG_EXT_INTER
#if CONFIG_AOM_HIGHBITDEPTH
void av1_highbd_build_inter_predictor(
const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref,
- const InterpFilter *interp_filter, enum mv_precision precision, int x,
- int y, int bd) {
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ enum mv_precision precision, int x, int y, int bd) {
const int is_q4 = precision == MV_PRECISION_Q4;
const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
is_q4 ? src_mv->col : src_mv->col * 2 };
@@ -36,16 +504,21 @@
src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
- high_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y, sf,
- w, h, ref, interp_filter, sf->x_step_q4, sf->y_step_q4,
- bd);
+ highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
+ sf, w, h, ref, interp_filter, sf->x_step_q4,
+ sf->y_step_q4, bd);
}
#endif // CONFIG_AOM_HIGHBITDEPTH
void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, const MV *src_mv,
const struct scale_factors *sf, int w, int h,
- int ref, const InterpFilter *interp_filter,
+ int ref,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
enum mv_precision precision, int x, int y) {
const int is_q4 = precision == MV_PRECISION_Q4;
const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
@@ -65,7 +538,11 @@
int mi_col_offset, int mi_row_offset,
#endif // CONFIG_MOTION_VAR
int block, int bw, int bh, int x, int y, int w,
- int h, int mi_x, int mi_y) {
+ int h,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ int wedge_offset_x, int wedge_offset_y,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ int mi_x, int mi_y) {
struct macroblockd_plane *const pd = &xd->plane[plane];
#if CONFIG_MOTION_VAR
const MODE_INFO *mi = xd->mi[mi_col_offset + xd->mi_stride * mi_row_offset];
@@ -75,7 +552,21 @@
#endif // CONFIG_MOTION_VAR
const int is_compound = has_second_ref(&mi->mbmi);
int ref;
+#if CONFIG_GLOBAL_MOTION
+ Global_Motion_Params *gm[2];
+ int is_global[2];
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ gm[ref] = &xd->global_motion[mi->mbmi.ref_frame[ref]];
+ is_global[ref] =
+ (get_y_mode(mi, block) == ZEROMV && get_gmtype(gm[ref]) > GLOBAL_ZERO);
+ }
+ // TODO(sarahparker) remove these once gm works with all experiments
+ (void)gm;
+ (void)is_global;
+#endif // CONFIG_GLOBAL_MOTION
+// TODO(sarahparker) enable the use of DUAL_FILTER in warped motion functions
+// in order to allow GLOBAL_MOTION and DUAL_FILTER to work together
#if CONFIG_SUB8X8_MC
#if CONFIG_MOTION_VAR
if (mi->mbmi.sb_type < BLOCK_8X8 && plane > 0 && !build_for_obmc) {
@@ -139,21 +630,21 @@
pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride +
(scaled_mv.col >> SUBPEL_BITS);
-#if CONFIG_AOM_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- high_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
- subpel_x, subpel_y, sf, x_step, y_step, ref,
- &mi->mbmi.interp_filter, xs, ys, xd->bd);
- } else {
- inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
- subpel_x, subpel_y, sf, x_step, y_step, ref,
- &mi->mbmi.interp_filter, xs, ys);
- }
-#else
- inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, x_step, y_step, ref,
- &mi->mbmi.interp_filter, xs, ys);
-#endif
+#if CONFIG_EXT_INTER
+ if (ref && is_interinter_wedge_used(mi->mbmi.sb_type) &&
+ mi->mbmi.use_wedge_interinter)
+ av1_make_masked_inter_predictor(
+ pre, pre_buf->stride, dst, dst_buf->stride, subpel_x, subpel_y,
+ sf, w, h, mi->mbmi.interp_filter, xs, ys,
+#if CONFIG_SUPERTX
+ wedge_offset_x, wedge_offset_y,
+#endif // CONFIG_SUPERTX
+ xd);
+ else
+#endif // CONFIG_EXT_INTER
+ av1_make_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+ subpel_x, subpel_y, sf, x_step, y_step,
+ ref, mi->mbmi.interp_filter, xs, ys, xd);
}
}
}
@@ -172,7 +663,7 @@
: average_split_mvs(pd, mi, ref, block))
#else
? average_split_mvs(pd, mi, ref, block)
-#endif // CONFIG_SUB8X8_MC && CONFIG_MOTION_VAR
+#endif // CONFIG_MOTION_VAR
: mi->mbmi.mv[ref].as_mv;
// TODO(jkoleszar): This clamping is done in the incorrect place for the
@@ -199,24 +690,40 @@
scaled_mv.col = mv_q4.col;
xs = ys = 16;
}
+
subpel_x = scaled_mv.col & SUBPEL_MASK;
subpel_y = scaled_mv.row & SUBPEL_MASK;
pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride +
(scaled_mv.col >> SUBPEL_BITS);
+#if CONFIG_EXT_INTER
+ if (ref && is_interinter_wedge_used(mi->mbmi.sb_type) &&
+ mi->mbmi.use_wedge_interinter)
+ av1_make_masked_inter_predictor(pre, pre_buf->stride, dst,
+ dst_buf->stride, subpel_x, subpel_y, sf,
+ w, h, mi->mbmi.interp_filter, xs, ys,
+#if CONFIG_SUPERTX
+ wedge_offset_x, wedge_offset_y,
+#endif // CONFIG_SUPERTX
+ xd);
+ else
+#else // CONFIG_EXT_INTER
+#if CONFIG_GLOBAL_MOTION
+ if (is_global[ref])
+ av1_warp_plane(&(gm[ref]->motion_params),
#if CONFIG_AOM_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- high_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, w, h, ref, &mi->mbmi.interp_filter, xs,
- ys, xd->bd);
- } else {
- inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, w, h, ref, &mi->mbmi.interp_filter, xs, ys);
- }
-#else
- inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, w, h, ref, &mi->mbmi.interp_filter, xs, ys);
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
#endif // CONFIG_AOM_HIGHBITDEPTH
+ pre_buf->buf0, pre_buf->width, pre_buf->height,
+ pre_buf->stride, dst, (mi_x >> pd->subsampling_x) + x,
+ (mi_y >> pd->subsampling_y) + y, w, h, dst_buf->stride,
+ pd->subsampling_x, pd->subsampling_y, xs, ys, ref);
+ else
+#endif // CONFIG_GLOBAL_MOTION
+#endif // CONFIG_EXT_INTER
+ av1_make_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+ subpel_x, subpel_y, sf, w, h, ref,
+ mi->mbmi.interp_filter, xs, ys, xd);
}
}
@@ -240,21 +747,21 @@
av1_highbd_build_inter_predictor(
pre, pd->pre[ref].stride, dst, pd->dst.stride,
&mi->bmi[i].as_mv[ref].as_mv, &xd->block_refs[ref]->sf, width, height,
- ref, &mi->mbmi.interp_filter, MV_PRECISION_Q3,
+ ref, mi->mbmi.interp_filter, MV_PRECISION_Q3,
mi_col * MI_SIZE + 4 * ic, mi_row * MI_SIZE + 4 * ir, xd->bd);
} else {
av1_build_inter_predictor(
pre, pd->pre[ref].stride, dst, pd->dst.stride,
&mi->bmi[i].as_mv[ref].as_mv, &xd->block_refs[ref]->sf, width, height,
- ref, &mi->mbmi.interp_filter, MV_PRECISION_Q3,
+ ref, mi->mbmi.interp_filter, MV_PRECISION_Q3,
mi_col * MI_SIZE + 4 * ic, mi_row * MI_SIZE + 4 * ir);
}
#else
av1_build_inter_predictor(
pre, pd->pre[ref].stride, dst, pd->dst.stride,
&mi->bmi[i].as_mv[ref].as_mv, &xd->block_refs[ref]->sf, width, height,
- ref, &mi->mbmi.interp_filter, MV_PRECISION_Q3,
- mi_col * MI_SIZE + 4 * ic, mi_row * MI_SIZE + 4 * ir);
+ ref, mi->mbmi.interp_filter, MV_PRECISION_Q3, mi_col * MI_SIZE + 4 * ic,
+ mi_row * MI_SIZE + 4 * ir);
#endif // CONFIG_AOM_HIGHBITDEPTH
}
}
@@ -288,14 +795,21 @@
#if CONFIG_MOTION_VAR
0, 0,
#endif // CONFIG_MOTION_VAR
- y * 2 + x, bw, bh, 4 * x, 4 * y, pw, ph, mi_x,
- mi_y);
+ y * 2 + x, bw, bh, 4 * x, 4 * y, pw, ph,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ 0, 0,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ mi_x, mi_y);
} else {
build_inter_predictors(xd, plane,
#if CONFIG_MOTION_VAR
0, 0,
#endif // CONFIG_MOTION_VAR
- 0, bw, bh, 0, 0, bw, bh, mi_x, mi_y);
+ 0, bw, bh, 0, 0, bw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ 0, 0,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ mi_x, mi_y);
}
}
}
@@ -303,23 +817,53 @@
void av1_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0);
+#if CONFIG_EXT_INTER
+ if (is_interintra_pred(&xd->mi[0]->mbmi))
+ av1_build_interintra_predictors_sby(xd, xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride, bsize);
+#endif // CONFIG_EXT_INTER
}
void av1_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize, int plane) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, plane, plane);
+#if CONFIG_EXT_INTER
+ if (is_interintra_pred(&xd->mi[0]->mbmi)) {
+ if (plane == 0) {
+ av1_build_interintra_predictors_sby(xd, xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride, bsize);
+ } else {
+ av1_build_interintra_predictors_sbc(xd, xd->plane[plane].dst.buf,
+ xd->plane[plane].dst.stride, plane,
+ bsize);
+ }
+ }
+#endif // CONFIG_EXT_INTER
}
void av1_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
MAX_MB_PLANE - 1);
+#if CONFIG_EXT_INTER
+ if (is_interintra_pred(&xd->mi[0]->mbmi))
+ av1_build_interintra_predictors_sbuv(
+ xd, xd->plane[1].dst.buf, xd->plane[2].dst.buf, xd->plane[1].dst.stride,
+ xd->plane[2].dst.stride, bsize);
+#endif // CONFIG_EXT_INTER
}
void av1_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
MAX_MB_PLANE - 1);
+#if CONFIG_EXT_INTER
+ if (is_interintra_pred(&xd->mi[0]->mbmi))
+ av1_build_interintra_predictors(
+ xd, xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
+ xd->plane[0].dst.stride, xd->plane[1].dst.stride,
+ xd->plane[2].dst.stride, bsize);
+#endif // CONFIG_EXT_INTER
}
void av1_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
@@ -327,14 +871,19 @@
int mi_col) {
uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
src->v_buffer };
+ const int widths[MAX_MB_PLANE] = { src->y_crop_width, src->uv_crop_width,
+ src->uv_crop_width };
+ const int heights[MAX_MB_PLANE] = { src->y_crop_height, src->uv_crop_height,
+ src->uv_crop_height };
const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
src->uv_stride };
int i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
struct macroblockd_plane *const pd = &planes[i];
- setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL,
- pd->subsampling_x, pd->subsampling_y);
+ setup_pred_plane(&pd->dst, buffers[i], widths[i], heights[i], strides[i],
+ mi_row, mi_col, NULL, pd->subsampling_x,
+ pd->subsampling_y);
}
}
@@ -345,16 +894,246 @@
int i;
uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
src->v_buffer };
+ const int widths[MAX_MB_PLANE] = { src->y_crop_width, src->uv_crop_width,
+ src->uv_crop_width };
+ const int heights[MAX_MB_PLANE] = { src->y_crop_height, src->uv_crop_height,
+ src->uv_crop_height };
const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
src->uv_stride };
for (i = 0; i < MAX_MB_PLANE; ++i) {
struct macroblockd_plane *const pd = &xd->plane[i];
- setup_pred_plane(&pd->pre[idx], buffers[i], strides[i], mi_row, mi_col,
- sf, pd->subsampling_x, pd->subsampling_y);
+ setup_pred_plane(&pd->pre[idx], buffers[i], widths[i], heights[i],
+ strides[i], mi_row, mi_col, sf, pd->subsampling_x,
+ pd->subsampling_y);
}
}
}
+#if CONFIG_SUPERTX
+static const uint8_t mask_8[8] = { 64, 64, 62, 52, 12, 2, 0, 0 };
+
+static const uint8_t mask_16[16] = { 63, 62, 60, 58, 55, 50, 43, 36,
+ 28, 21, 14, 9, 6, 4, 2, 1 };
+
+static const uint8_t mask_32[32] = { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63,
+ 61, 57, 52, 45, 36, 28, 19, 12, 7, 3, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+static const uint8_t mask_8_uv[8] = { 64, 64, 62, 52, 12, 2, 0, 0 };
+
+static const uint8_t mask_16_uv[16] = { 64, 64, 64, 64, 61, 53, 45, 36,
+ 28, 19, 11, 3, 0, 0, 0, 0 };
+
+static const uint8_t mask_32_uv[32] = { 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 60, 54, 46, 36,
+ 28, 18, 10, 4, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+
+static const uint8_t *get_supertx_mask(int length, int plane) {
+ switch (length) {
+ case 8: return plane ? mask_8_uv : mask_8;
+ case 16: return plane ? mask_16_uv : mask_16;
+ case 32: return plane ? mask_32_uv : mask_32;
+ default: assert(0);
+ }
+ return NULL;
+}
+
+void av1_build_masked_inter_predictor_complex(
+ MACROBLOCKD *xd, uint8_t *dst, int dst_stride, const uint8_t *pre,
+ int pre_stride, int mi_row, int mi_col, int mi_row_ori, int mi_col_ori,
+ BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, PARTITION_TYPE partition,
+ int plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const int ssx = pd->subsampling_x;
+ const int ssy = pd->subsampling_y;
+ const int top_w = (4 << b_width_log2_lookup[top_bsize]) >> ssx;
+ const int top_h = (4 << b_height_log2_lookup[top_bsize]) >> ssy;
+ const int w = (4 << b_width_log2_lookup[bsize]) >> ssx;
+ const int h = (4 << b_height_log2_lookup[bsize]) >> ssy;
+ const int w_offset = ((mi_col - mi_col_ori) * MI_SIZE) >> ssx;
+ const int h_offset = ((mi_row - mi_row_ori) * MI_SIZE) >> ssy;
+
+ int w_remain, h_remain;
+
+#if CONFIG_AOM_HIGHBITDEPTH
+ const int is_hdb = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+ assert(bsize <= BLOCK_32X32);
+ assert(IMPLIES(plane == 0, ssx == 0));
+ assert(IMPLIES(plane == 0, ssy == 0));
+
+ switch (partition) {
+ case PARTITION_HORZ: {
+ const uint8_t *const mask = get_supertx_mask(h, ssy);
+
+ w_remain = top_w;
+ h_remain = top_h - h_offset - h;
+ dst += h_offset * dst_stride;
+ pre += h_offset * pre_stride;
+
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (is_hdb)
+ aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, pre,
+ pre_stride, mask, h, top_w, xd->bd);
+ else
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, pre, pre_stride,
+ mask, h, top_w);
+
+ dst += h * dst_stride;
+ pre += h * pre_stride;
+ break;
+ }
+ case PARTITION_VERT: {
+ const uint8_t *const mask = get_supertx_mask(w, ssx);
+
+ w_remain = top_w - w_offset - w;
+ h_remain = top_h;
+ dst += w_offset;
+ pre += w_offset;
+
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (is_hdb)
+ aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, pre,
+ pre_stride, mask, top_h, w, xd->bd);
+ else
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, pre, pre_stride,
+ mask, top_h, w);
+
+ dst += w;
+ pre += w;
+ break;
+ }
+ default: {
+ assert(0);
+ return;
+ }
+ }
+
+ if (w_remain == 0 || h_remain == 0) {
+ return;
+ }
+
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (is_hdb) {
+ dst = (uint8_t *)CONVERT_TO_SHORTPTR(dst);
+ pre = (const uint8_t *)CONVERT_TO_SHORTPTR(pre);
+ dst_stride *= 2;
+ pre_stride *= 2;
+ w_remain *= 2;
+ }
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+ do {
+ memcpy(dst, pre, w_remain * sizeof(uint8_t));
+ dst += dst_stride;
+ pre += pre_stride;
+ } while (--h_remain);
+}
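
The PARTITION_HORZ and PARTITION_VERT branches above feather the supertx boundary with aom_blend_a64_vmask / aom_blend_a64_hmask and then copy the remaining rows or columns verbatim. A minimal sketch of the vertical-mask blend, assuming the usual 64-weight alpha with 6-bit rounding; blend_vmask_sketch and its parameter names are illustrative only:

#include <stdint.h>

// Per-row blend: row i of dst is mixed with row i of pre using mask[i] / 64,
// which is what the PARTITION_HORZ branch above relies on.
static void blend_vmask_sketch(uint8_t *dst, int dst_stride,
                               const uint8_t *pre, int pre_stride,
                               const uint8_t *mask, int h, int w) {
  int i, j;
  for (i = 0; i < h; ++i) {
    const int m = mask[i];  // 64 keeps dst, 0 takes pre, in-between feathers
    for (j = 0; j < w; ++j) {
      const int d = dst[i * dst_stride + j];
      const int p = pre[i * pre_stride + j];
      dst[i * dst_stride + j] = (uint8_t)((m * d + (64 - m) * p + 32) >> 6);
    }
  }
}

With mask_8 above, for example, the first rows keep dst (weight 64) and the last rows take pre (weight 0), giving a smooth hand-off across the partition edge.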
+
+void av1_build_inter_predictors_sb_sub8x8_extend(MACROBLOCKD *xd,
+#if CONFIG_EXT_INTER
+ int mi_row_ori, int mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int block) {
+  // Prediction function used in supertx:
+  // Use the mv of the current block (which is smaller than 8x8) to build the
+  // prediction for a block located at (mi_row, mi_col) with size bsize, where
+  // bsize can be larger than 8x8.
+  // block (0-3): the sub8x8 location of the current block.
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+#if CONFIG_EXT_INTER
+ const int wedge_offset_x = (mi_col_ori - mi_col) * MI_SIZE;
+ const int wedge_offset_y = (mi_row_ori - mi_row) * MI_SIZE;
+#endif // CONFIG_EXT_INTER
+
+  // For sub8x8 uv:
+  // Skip uv prediction in supertx except for the first block (block == 0).
+ int max_plane = block ? 1 : MAX_MB_PLANE;
+
+ for (plane = 0; plane < max_plane; plane++) {
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, &xd->plane[plane]);
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
+
+ build_inter_predictors(xd, plane,
+#if CONFIG_MOTION_VAR
+ 0, 0,
+#endif // CONFIG_MOTION_VAR
+ block, bw, bh, 0, 0, bw, bh,
+#if CONFIG_EXT_INTER
+ wedge_offset_x, wedge_offset_y,
+#endif // CONFIG_EXT_INTER
+ mi_x, mi_y);
+ }
+#if CONFIG_EXT_INTER
+ if (is_interintra_pred(&xd->mi[0]->mbmi))
+ av1_build_interintra_predictors(
+ xd, xd->plane[0].dst.buf, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
+ xd->plane[0].dst.stride, xd->plane[1].dst.stride,
+ xd->plane[2].dst.stride, bsize);
+#endif // CONFIG_EXT_INTER
+}
+
+void av1_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
+#if CONFIG_EXT_INTER
+ int mi_row_ori, int mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+#if CONFIG_EXT_INTER
+ const int wedge_offset_x = (mi_col_ori - mi_col) * MI_SIZE;
+ const int wedge_offset_y = (mi_row_ori - mi_row) * MI_SIZE;
+#endif // CONFIG_EXT_INTER
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, &xd->plane[plane]);
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
+
+ if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
+ int x, y;
+ assert(bsize == BLOCK_8X8);
+ for (y = 0; y < num_4x4_h; ++y)
+ for (x = 0; x < num_4x4_w; ++x)
+ build_inter_predictors(xd, plane,
+#if CONFIG_MOTION_VAR
+ 0, 0,
+#endif // CONFIG_MOTION_VAR
+ y * 2 + x, bw, bh, 4 * x, 4 * y, 4, 4,
+#if CONFIG_EXT_INTER
+ wedge_offset_x, wedge_offset_y,
+#endif // CONFIG_EXT_INTER
+ mi_x, mi_y);
+ } else {
+ build_inter_predictors(xd, plane,
+#if CONFIG_MOTION_VAR
+ 0, 0,
+#endif // CONFIG_MOTION_VAR
+ 0, bw, bh, 0, 0, bw, bh,
+#if CONFIG_EXT_INTER
+ wedge_offset_x, wedge_offset_y,
+#endif // CONFIG_EXT_INTER
+ mi_x, mi_y);
+ }
+ }
+}
+#endif // CONFIG_SUPERTX
+
#if CONFIG_MOTION_VAR
// obmc_mask_N[overlap_position]
static const uint8_t obmc_mask_1[1] = { 55 };
@@ -373,6 +1152,15 @@
56, 57, 58, 59, 60, 60, 61, 62,
62, 63, 63, 64, 64, 64, 64, 64 };
+#if CONFIG_EXT_PARTITION
+static const uint8_t obmc_mask_64[64] = {
+ 33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44,
+ 45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56,
+ 56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62,
+ 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+#endif // CONFIG_EXT_PARTITION
+
const uint8_t *av1_get_obmc_mask(int length) {
switch (length) {
case 1: return obmc_mask_1;
@@ -381,6 +1169,9 @@
case 8: return obmc_mask_8;
case 16: return obmc_mask_16;
case 32: return obmc_mask_32;
+#if CONFIG_EXT_PARTITION
+ case 64: return obmc_mask_64;
+#endif // CONFIG_EXT_PARTITION
default: assert(0); return NULL;
}
}
@@ -392,98 +1183,120 @@
void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int mi_col,
uint8_t *above[MAX_MB_PLANE],
- const int above_stride[MAX_MB_PLANE],
+ int above_stride[MAX_MB_PLANE],
uint8_t *left[MAX_MB_PLANE],
- const int left_stride[MAX_MB_PLANE]) {
- const TileInfo *const tile = &xd->tile;
- BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
- int plane, i, mi_step;
- const int above_available = mi_row > tile->mi_row_start;
+ int left_stride[MAX_MB_PLANE]) {
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+ int plane, i;
#if CONFIG_AOM_HIGHBITDEPTH
- int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+ const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
#endif // CONFIG_AOM_HIGHBITDEPTH
// handle above row
- for (i = 0; above_available && i < AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
- i += mi_step) {
- int mi_row_offset = -1;
- int mi_col_offset = i;
- MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
- MB_MODE_INFO *mbmi = &mi->mbmi;
- int overlap;
+ if (xd->up_available) {
+ const int overlap = num_4x4_blocks_high_lookup[bsize] * 2;
+ const int miw = AOMMIN(xd->n8_w, cm->mi_cols - mi_col);
+ const int mi_row_offset = -1;
- mi_step = AOMMIN(xd->n8_w, num_8x8_blocks_wide_lookup[mbmi->sb_type]);
+ assert(miw > 0);
- if (!is_neighbor_overlappable(mbmi)) continue;
+ i = 0;
+ do { // for each mi in the above row
+ const int mi_col_offset = i;
+ const MB_MODE_INFO *const above_mbmi =
+ &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
+ const int mi_step =
+ AOMMIN(xd->n8_w, num_8x8_blocks_wide_lookup[above_mbmi->sb_type]);
- overlap = num_4x4_blocks_high_lookup[bsize] << 1;
-
- for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
- const struct macroblockd_plane *pd = &xd->plane[plane];
- const int bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
- const int bh = overlap >> pd->subsampling_y;
- const int dst_stride = pd->dst.stride;
- uint8_t *dst = &pd->dst.buf[(i * MI_SIZE) >> pd->subsampling_x];
- const int tmp_stride = above_stride[plane];
- const uint8_t *const tmp =
- &above[plane][(i * MI_SIZE) >> pd->subsampling_x];
- const uint8_t *const mask = av1_get_obmc_mask(bh);
+ if (is_neighbor_overlappable(above_mbmi)) {
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const int bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
+ const int bh = overlap >> pd->subsampling_y;
+ const int dst_stride = pd->dst.stride;
+ uint8_t *const dst = &pd->dst.buf[(i * MI_SIZE) >> pd->subsampling_x];
+ const int tmp_stride = above_stride[plane];
+ const uint8_t *const tmp =
+ &above[plane][(i * MI_SIZE) >> pd->subsampling_x];
+ const uint8_t *const mask = av1_get_obmc_mask(bh);
#if CONFIG_AOM_HIGHBITDEPTH
- if (is_hbd)
- aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
- tmp_stride, mask, bh, bw, xd->bd);
- else
+ if (is_hbd)
+ aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
+ tmp_stride, mask, bh, bw, xd->bd);
+ else
#endif // CONFIG_AOM_HIGHBITDEPTH
- aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
- mask, bh, bw);
- }
- } // each mi in the above row
+ aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
+ tmp_stride, mask, bh, bw);
+ }
+ }
+ i += mi_step;
+ } while (i < miw);
+ }
// handle left column
- if (mi_col - 1 < tile->mi_col_start) return;
+ if (xd->left_available) {
+ const int overlap = num_4x4_blocks_wide_lookup[bsize] * 2;
+ const int mih = AOMMIN(xd->n8_h, cm->mi_rows - mi_row);
+ const int mi_col_offset = -1;
- for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
- int mi_row_offset = i;
- int mi_col_offset = -1;
- int overlap;
- MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
- MB_MODE_INFO *mbmi = &mi->mbmi;
+ assert(mih > 0);
- mi_step = AOMMIN(xd->n8_h, num_8x8_blocks_high_lookup[mbmi->sb_type]);
+ i = 0;
+ do { // for each mi in the left column
+ const int mi_row_offset = i;
+ const MB_MODE_INFO *const left_mbmi =
+ &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
+ const int mi_step =
+ AOMMIN(xd->n8_h, num_8x8_blocks_high_lookup[left_mbmi->sb_type]);
- if (!is_neighbor_overlappable(mbmi)) continue;
-
- overlap = num_4x4_blocks_wide_lookup[bsize] << 1;
-
- for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
- const struct macroblockd_plane *pd = &xd->plane[plane];
- const int bw = overlap >> pd->subsampling_x;
- const int bh = (mi_step * MI_SIZE) >> pd->subsampling_y;
- const int dst_stride = pd->dst.stride;
- uint8_t *dst =
- &pd->dst.buf[(i * MI_SIZE * dst_stride) >> pd->subsampling_y];
- const int tmp_stride = left_stride[plane];
- const uint8_t *const tmp =
- &left[plane][(i * MI_SIZE * tmp_stride) >> pd->subsampling_y];
- const uint8_t *const mask = av1_get_obmc_mask(bw);
+ if (is_neighbor_overlappable(left_mbmi)) {
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const int bw = overlap >> pd->subsampling_x;
+ const int bh = (mi_step * MI_SIZE) >> pd->subsampling_y;
+ const int dst_stride = pd->dst.stride;
+ uint8_t *const dst =
+ &pd->dst.buf[(i * MI_SIZE * dst_stride) >> pd->subsampling_y];
+ const int tmp_stride = left_stride[plane];
+ const uint8_t *const tmp =
+ &left[plane][(i * MI_SIZE * tmp_stride) >> pd->subsampling_y];
+ const uint8_t *const mask = av1_get_obmc_mask(bw);
#if CONFIG_AOM_HIGHBITDEPTH
- if (is_hbd)
- aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
- tmp_stride, mask, bh, bw, xd->bd);
- else
+ if (is_hbd)
+ aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
+ tmp_stride, mask, bh, bw, xd->bd);
+ else
#endif // CONFIG_AOM_HIGHBITDEPTH
- aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
- mask, bh, bw);
- }
- } // each mi in the left column
+ aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
+ tmp_stride, mask, bh, bw);
+ }
+ }
+ i += mi_step;
+ } while (i < mih);
+ }
}
+#if CONFIG_EXT_INTER
+void modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
+ if (is_interintra_pred(mbmi)) {
+ mbmi->ref_frame[1] = NONE;
+ } else if (has_second_ref(mbmi) && is_interinter_wedge_used(mbmi->sb_type) &&
+ mbmi->use_wedge_interinter) {
+ mbmi->use_wedge_interinter = 0;
+ mbmi->ref_frame[1] = NONE;
+ }
+}
+#endif // CONFIG_EXT_INTER
+
void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int mi_col,
uint8_t *tmp_buf[MAX_MB_PLANE],
- const int tmp_stride[MAX_MB_PLANE]) {
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]) {
const TileInfo *const tile = &xd->tile;
BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
int i, j, mi_step, ref;
@@ -496,20 +1309,29 @@
int mi_row_offset = -1;
int mi_col_offset = i;
int mi_x, mi_y, bw, bh;
- MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
- MB_MODE_INFO *mbmi = &mi->mbmi;
+ MODE_INFO *above_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ MB_MODE_INFO *above_mbmi = &above_mi->mbmi;
+#if CONFIG_EXT_INTER
+ MB_MODE_INFO backup_mbmi;
+#endif // CONFIG_EXT_INTER
- mi_step = AOMMIN(xd->n8_w, num_8x8_blocks_wide_lookup[mbmi->sb_type]);
+ mi_step = AOMMIN(xd->n8_w, num_8x8_blocks_wide_lookup[above_mbmi->sb_type]);
- if (!is_neighbor_overlappable(mbmi)) continue;
+ if (!is_neighbor_overlappable(above_mbmi)) continue;
+
+#if CONFIG_EXT_INTER
+ backup_mbmi = *above_mbmi;
+ modify_neighbor_predictor_for_obmc(above_mbmi);
+#endif // CONFIG_EXT_INTER
for (j = 0; j < MAX_MB_PLANE; ++j) {
struct macroblockd_plane *const pd = &xd->plane[j];
- setup_pred_plane(&pd->dst, tmp_buf[j], tmp_stride[j], 0, i, NULL,
- pd->subsampling_x, pd->subsampling_y);
+ setup_pred_plane(&pd->dst, tmp_buf[j], tmp_width[j], tmp_height[j],
+ tmp_stride[j], 0, i, NULL, pd->subsampling_x,
+ pd->subsampling_y);
}
- for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
- const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+ for (ref = 0; ref < 1 + has_second_ref(above_mbmi); ++ref) {
+ const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
xd->block_refs[ref] = ref_buf;
@@ -528,16 +1350,16 @@
for (j = 0; j < MAX_MB_PLANE; ++j) {
const struct macroblockd_plane *pd = &xd->plane[j];
- bw = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_x;
- bh = AOMMAX((num_4x4_blocks_high_lookup[bsize] << 1) >> pd->subsampling_y,
+ bw = (mi_step * 8) >> pd->subsampling_x;
+ bh = AOMMAX((num_4x4_blocks_high_lookup[bsize] * 2) >> pd->subsampling_y,
4);
- if (mbmi->sb_type < BLOCK_8X8) {
- const PARTITION_TYPE bp = BLOCK_8X8 - mbmi->sb_type;
+ if (above_mbmi->sb_type < BLOCK_8X8) {
+ const PARTITION_TYPE bp = BLOCK_8X8 - above_mbmi->sb_type;
const int have_vsplit = bp != PARTITION_HORZ;
const int have_hsplit = bp != PARTITION_VERT;
- const int num_4x4_w = 2 >> (!have_vsplit);
- const int num_4x4_h = 2 >> (!have_hsplit);
+ const int num_4x4_w = 2 >> !have_vsplit;
+ const int num_4x4_h = 2 >> !have_hsplit;
const int pw = 8 >> (have_vsplit + pd->subsampling_x);
int x, y;
@@ -546,15 +1368,26 @@
if ((bp == PARTITION_HORZ || bp == PARTITION_SPLIT) && y == 0)
continue;
- build_inter_predictors(
- xd, j, mi_col_offset, mi_row_offset, y * 2 + x, bw, bh,
- (4 * x) >> pd->subsampling_x, 0, pw, bh, mi_x, mi_y);
+ build_inter_predictors(xd, j, mi_col_offset, mi_row_offset,
+ y * 2 + x, bw, bh,
+ (4 * x) >> pd->subsampling_x, 0, pw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ 0, 0,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ mi_x, mi_y);
}
} else {
build_inter_predictors(xd, j, mi_col_offset, mi_row_offset, 0, bw, bh,
- 0, 0, bw, bh, mi_x, mi_y);
+ 0, 0, bw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ 0, 0,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ mi_x, mi_y);
}
}
+#if CONFIG_EXT_INTER
+ *above_mbmi = backup_mbmi;
+#endif // CONFIG_EXT_INTER
}
xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
xd->mb_to_right_edge = mb_to_right_edge_base;
@@ -564,33 +1397,44 @@
void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int mi_col,
uint8_t *tmp_buf[MAX_MB_PLANE],
- const int tmp_stride[MAX_MB_PLANE]) {
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]) {
const TileInfo *const tile = &xd->tile;
BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
int i, j, mi_step, ref;
int mb_to_bottom_edge_base = xd->mb_to_bottom_edge;
- if (mi_col - 1 < tile->mi_col_start) return;
+ if (mi_col == 0 || (mi_col - 1 < tile->mi_col_start)) return;
xd->mb_to_right_edge += xd->n8_w * 32;
for (i = 0; i < AOMMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
int mi_row_offset = i;
int mi_col_offset = -1;
int mi_x, mi_y, bw, bh;
- MODE_INFO *mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
- MB_MODE_INFO *mbmi = &mi->mbmi;
+ MODE_INFO *left_mi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ MB_MODE_INFO *left_mbmi = &left_mi->mbmi;
+#if CONFIG_EXT_INTER
+ MB_MODE_INFO backup_mbmi;
+#endif // CONFIG_EXT_INTER
- mi_step = AOMMIN(xd->n8_h, num_8x8_blocks_high_lookup[mbmi->sb_type]);
+ mi_step = AOMMIN(xd->n8_h, num_8x8_blocks_high_lookup[left_mbmi->sb_type]);
- if (!is_neighbor_overlappable(mbmi)) continue;
+ if (!is_neighbor_overlappable(left_mbmi)) continue;
+
+#if CONFIG_EXT_INTER
+ backup_mbmi = *left_mbmi;
+ modify_neighbor_predictor_for_obmc(left_mbmi);
+#endif // CONFIG_EXT_INTER
for (j = 0; j < MAX_MB_PLANE; ++j) {
struct macroblockd_plane *const pd = &xd->plane[j];
- setup_pred_plane(&pd->dst, tmp_buf[j], tmp_stride[j], i, 0, NULL,
- pd->subsampling_x, pd->subsampling_y);
+ setup_pred_plane(&pd->dst, tmp_buf[j], tmp_width[j], tmp_height[j],
+ tmp_stride[j], i, 0, NULL, pd->subsampling_x,
+ pd->subsampling_y);
}
- for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
- const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+ for (ref = 0; ref < 1 + has_second_ref(left_mbmi); ++ref) {
+ const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
const RefBuffer *const ref_buf = &cm->frame_refs[frame - LAST_FRAME];
xd->block_refs[ref] = ref_buf;
@@ -609,16 +1453,16 @@
for (j = 0; j < MAX_MB_PLANE; ++j) {
const struct macroblockd_plane *pd = &xd->plane[j];
- bw = AOMMAX((num_4x4_blocks_wide_lookup[bsize] << 1) >> pd->subsampling_x,
+ bw = AOMMAX((num_4x4_blocks_wide_lookup[bsize] * 2) >> pd->subsampling_x,
4);
bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y;
- if (mbmi->sb_type < BLOCK_8X8) {
- const PARTITION_TYPE bp = BLOCK_8X8 - mbmi->sb_type;
+ if (left_mbmi->sb_type < BLOCK_8X8) {
+ const PARTITION_TYPE bp = BLOCK_8X8 - left_mbmi->sb_type;
const int have_vsplit = bp != PARTITION_HORZ;
const int have_hsplit = bp != PARTITION_VERT;
- const int num_4x4_w = 2 >> (!have_vsplit);
- const int num_4x4_h = 2 >> (!have_hsplit);
+ const int num_4x4_w = 2 >> !have_vsplit;
+ const int num_4x4_h = 2 >> !have_hsplit;
const int ph = 8 >> (have_hsplit + pd->subsampling_y);
int x, y;
@@ -627,15 +1471,26 @@
if ((bp == PARTITION_VERT || bp == PARTITION_SPLIT) && x == 0)
continue;
- build_inter_predictors(
- xd, j, mi_col_offset, mi_row_offset, y * 2 + x, bw, bh, 0,
- (4 * y) >> pd->subsampling_y, bw, ph, mi_x, mi_y);
+ build_inter_predictors(xd, j, mi_col_offset, mi_row_offset,
+ y * 2 + x, bw, bh, 0,
+ (4 * y) >> pd->subsampling_y, bw, ph,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ 0, 0,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ mi_x, mi_y);
}
} else {
build_inter_predictors(xd, j, mi_col_offset, mi_row_offset, 0, bw, bh,
- 0, 0, bw, bh, mi_x, mi_y);
+ 0, 0, bw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ 0, 0,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ mi_x, mi_y);
}
}
+#if CONFIG_EXT_INTER
+ *left_mbmi = backup_mbmi;
+#endif // CONFIG_EXT_INTER
}
xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
xd->mb_to_bottom_edge = mb_to_bottom_edge_base;
@@ -652,10 +1507,12 @@
DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]);
#endif // CONFIG_AOM_HIGHBITDEPTH
uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
- const int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE,
- MAX_SB_SIZE };
- const int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE,
- MAX_SB_SIZE };
+ int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
#if CONFIG_AOM_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -678,11 +1535,574 @@
}
#endif // CONFIG_AOM_HIGHBITDEPTH
av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
- dst_stride1);
+ dst_width1, dst_height1, dst_stride1);
av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
- dst_stride2);
+ dst_width2, dst_height2, dst_stride2);
av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1, dst_stride1,
dst_buf2, dst_stride2);
}
#endif // CONFIG_MOTION_VAR
+
+#if CONFIG_EXT_INTER
+#if CONFIG_EXT_PARTITION
+static const int ii_weights1d[MAX_SB_SIZE] = {
+ 102, 100, 97, 95, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 73, 71, 69, 68, 67,
+ 65, 64, 62, 61, 60, 59, 58, 57, 55, 54, 53, 52, 52, 51, 50, 49, 48, 47, 47,
+ 46, 45, 45, 44, 43, 43, 42, 41, 41, 40, 40, 39, 39, 38, 38, 38, 37, 37, 36,
+ 36, 36, 35, 35, 35, 34, 34, 34, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 28,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+};
+static int ii_size_scales[BLOCK_SIZES] = { 32, 16, 16, 16, 8, 8, 8, 4,
+ 4, 4, 2, 2, 2, 1, 1, 1 };
+#else
+static const int ii_weights1d[MAX_SB_SIZE] = {
+ 102, 100, 97, 95, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 73, 71,
+ 69, 68, 67, 65, 64, 62, 61, 60, 59, 58, 57, 55, 54, 53, 52, 52,
+ 51, 50, 49, 48, 47, 47, 46, 45, 45, 44, 43, 43, 42, 41, 41, 40,
+ 40, 39, 39, 38, 38, 38, 37, 37, 36, 36, 36, 35, 35, 35, 34, 34,
+};
+static int ii_size_scales[BLOCK_SIZES] = { 16, 8, 8, 8, 4, 4, 4,
+ 2, 2, 2, 1, 1, 1 };
+#endif // CONFIG_EXT_PARTITION
+
+static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra,
+ int wedge_index, int wedge_sign,
+ BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
+ uint8_t *comppred, int compstride,
+ const uint8_t *interpred, int interstride,
+ const uint8_t *intrapred, int intrastride) {
+ const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+ const int size_scale = ii_size_scales[plane_bsize];
+ int i, j;
+
+ if (use_wedge_interintra) {
+ if (is_interintra_wedge_used(bsize)) {
+ const uint8_t *mask =
+ av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
+ const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
+ aom_blend_a64_mask(
+ comppred, compstride, intrapred, intrastride, interpred, interstride,
+ mask, 4 * num_4x4_blocks_wide_lookup[bsize], bh, bw, subh, subw);
+ }
+ return;
+ }
+
+ switch (mode) {
+ case II_V_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = ii_weights1d[i * size_scale];
+ comppred[i * compstride + j] =
+ AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_H_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = ii_weights1d[j * size_scale];
+ comppred[i * compstride + j] =
+ AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_D63_PRED:
+ case II_D117_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = (ii_weights1d[i * size_scale] * 3 +
+ ii_weights1d[j * size_scale]) >>
+ 2;
+ comppred[i * compstride + j] =
+ AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_D207_PRED:
+ case II_D153_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = (ii_weights1d[j * size_scale] * 3 +
+ ii_weights1d[i * size_scale]) >>
+ 2;
+ comppred[i * compstride + j] =
+ AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_D135_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = ii_weights1d[(i < j ? i : j) * size_scale];
+ comppred[i * compstride + j] =
+ AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_D45_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale =
+ (ii_weights1d[i * size_scale] + ii_weights1d[j * size_scale]) >>
+ 1;
+ comppred[i * compstride + j] =
+ AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_TM_PRED:
+ case II_DC_PRED:
+ default:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ comppred[i * compstride + j] = AOM_BLEND_AVG(
+ intrapred[i * intrastride + j], interpred[i * interstride + j]);
+ }
+ }
+ break;
+ }
+}
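
Every directional case above is the same 256-weight blend of the intra and inter predictions; only the index into ii_weights1d changes with the direction. A small sketch of the II_V_PRED case, assuming 8-bit alpha rounding; the function name is illustrative and ii_weights1d refers to the table defined above:

#include <stdint.h>

// Sketch of the II_V_PRED branch: the intra weight comes from ii_weights1d
// indexed by row, so it is largest near the top edge and decays further down.
static void combine_ii_v_pred_sketch(uint8_t *comp, int comp_stride,
                                     const uint8_t *inter, int inter_stride,
                                     const uint8_t *intra, int intra_stride,
                                     int bw, int bh, int size_scale) {
  int i, j;
  for (i = 0; i < bh; ++i) {
    const int scale = ii_weights1d[i * size_scale];  // table defined above
    for (j = 0; j < bw; ++j) {
      const int a = intra[i * intra_stride + j];
      const int b = inter[i * inter_stride + j];
      comp[i * comp_stride + j] =
          (uint8_t)((scale * a + (256 - scale) * b + 128) >> 8);
    }
  }
}

This matches the intuition that the intra prediction is most reliable in the rows closest to its reference samples, with the inter prediction taking over elsewhere.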
+
+#if CONFIG_AOM_HIGHBITDEPTH
+static void combine_interintra_highbd(
+ INTERINTRA_MODE mode, int use_wedge_interintra, int wedge_index,
+ int wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
+ uint8_t *comppred8, int compstride, const uint8_t *interpred8,
+ int interstride, const uint8_t *intrapred8, int intrastride, int bd) {
+ const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+ const int size_scale = ii_size_scales[plane_bsize];
+ int i, j;
+
+ uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8);
+ const uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
+ const uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
+
+ if (use_wedge_interintra) {
+ if (is_interintra_wedge_used(bsize)) {
+ const uint8_t *mask =
+ av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
+ const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
+ aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride,
+ interpred8, interstride, mask, bw, bh, bw, subh,
+ subw, bd);
+ }
+ return;
+ }
+
+ switch (mode) {
+ case II_V_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = ii_weights1d[i * size_scale];
+ comppred[i * compstride + j] =
+ AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_H_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = ii_weights1d[j * size_scale];
+ comppred[i * compstride + j] =
+ AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_D63_PRED:
+ case II_D117_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = (ii_weights1d[i * size_scale] * 3 +
+ ii_weights1d[j * size_scale]) >>
+ 2;
+ comppred[i * compstride + j] =
+ AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_D207_PRED:
+ case II_D153_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = (ii_weights1d[j * size_scale] * 3 +
+ ii_weights1d[i * size_scale]) >>
+ 2;
+ comppred[i * compstride + j] =
+ AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_D135_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale = ii_weights1d[(i < j ? i : j) * size_scale];
+ comppred[i * compstride + j] =
+ AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_D45_PRED:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ int scale =
+ (ii_weights1d[i * size_scale] + ii_weights1d[j * size_scale]) >>
+ 1;
+ comppred[i * compstride + j] =
+ AOM_BLEND_A256(scale, intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
+ }
+ }
+ break;
+
+ case II_TM_PRED:
+ case II_DC_PRED:
+ default:
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) {
+ comppred[i * compstride + j] = AOM_BLEND_AVG(
+ interpred[i * interstride + j], intrapred[i * intrastride + j]);
+ }
+ }
+ break;
+ }
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+// Break down rectangular intra prediction for joint spatio-temporal prediction
+// into two square intra predictions.
+static void build_intra_predictors_for_interintra(MACROBLOCKD *xd, uint8_t *ref,
+ int ref_stride, uint8_t *dst,
+ int dst_stride,
+ PREDICTION_MODE mode,
+ BLOCK_SIZE bsize, int plane) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
+ const int bwl = b_width_log2_lookup[plane_bsize];
+ const int bhl = b_height_log2_lookup[plane_bsize];
+ const int pxbw = 4 << bwl;
+ const int pxbh = 4 << bhl;
+ TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+
+ if (bwl == bhl) {
+ av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, ref,
+ ref_stride, dst, dst_stride, 0, 0, plane);
+
+ } else if (bwl < bhl) {
+ uint8_t *src_2 = ref + pxbw * ref_stride;
+ uint8_t *dst_2 = dst + pxbw * dst_stride;
+ av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, ref,
+ ref_stride, dst, dst_stride, 0, 0, plane);
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *src_216 = CONVERT_TO_SHORTPTR(src_2);
+ uint16_t *dst_216 = CONVERT_TO_SHORTPTR(dst_2);
+ memcpy(src_216 - ref_stride, dst_216 - dst_stride,
+ sizeof(*src_216) * pxbw);
+ } else
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ {
+ memcpy(src_2 - ref_stride, dst_2 - dst_stride, sizeof(*src_2) * pxbw);
+ }
+ av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, src_2,
+ ref_stride, dst_2, dst_stride, 0, 1 << bwl, plane);
+ } else { // bwl > bhl
+ int i;
+ uint8_t *src_2 = ref + pxbh;
+ uint8_t *dst_2 = dst + pxbh;
+ av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, ref,
+ ref_stride, dst, dst_stride, 0, 0, plane);
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *src_216 = CONVERT_TO_SHORTPTR(src_2);
+ uint16_t *dst_216 = CONVERT_TO_SHORTPTR(dst_2);
+ for (i = 0; i < pxbh; ++i)
+ src_216[i * ref_stride - 1] = dst_216[i * dst_stride - 1];
+ } else
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ {
+ for (i = 0; i < pxbh; ++i)
+ src_2[i * ref_stride - 1] = dst_2[i * dst_stride - 1];
+ }
+ av1_predict_intra_block(xd, pd->width, pd->height, max_tx_size, mode, src_2,
+ ref_stride, dst_2, dst_stride, 1 << bhl, 0, plane);
+ }
+}
+
+void av1_build_intra_predictors_for_interintra(MACROBLOCKD *xd,
+ BLOCK_SIZE bsize, int plane,
+ uint8_t *dst, int dst_stride) {
+ build_intra_predictors_for_interintra(
+ xd, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, dst,
+ dst_stride, interintra_to_intra_mode[xd->mi[0]->mbmi.interintra_mode],
+ bsize, plane);
+}
+
+void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
+ const uint8_t *inter_pred, int inter_stride,
+ const uint8_t *intra_pred, int intra_stride) {
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ combine_interintra_highbd(
+ xd->mi[0]->mbmi.interintra_mode, xd->mi[0]->mbmi.use_wedge_interintra,
+ xd->mi[0]->mbmi.interintra_wedge_index,
+ xd->mi[0]->mbmi.interintra_wedge_sign, bsize, plane_bsize,
+ xd->plane[plane].dst.buf, xd->plane[plane].dst.stride, inter_pred,
+ inter_stride, intra_pred, intra_stride, xd->bd);
+ return;
+ }
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ combine_interintra(xd->mi[0]->mbmi.interintra_mode,
+ xd->mi[0]->mbmi.use_wedge_interintra,
+ xd->mi[0]->mbmi.interintra_wedge_index,
+ xd->mi[0]->mbmi.interintra_wedge_sign, bsize, plane_bsize,
+ xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
+ inter_pred, inter_stride, intra_pred, intra_stride);
+}
+
+void av1_build_interintra_predictors_sby(MACROBLOCKD *xd, uint8_t *ypred,
+ int ystride, BLOCK_SIZE bsize) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
+ av1_build_intra_predictors_for_interintra(
+ xd, bsize, 0, CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
+ av1_combine_interintra(xd, bsize, 0, ypred, ystride,
+ CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
+ return;
+ }
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ {
+ DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
+ av1_build_intra_predictors_for_interintra(xd, bsize, 0, intrapredictor,
+ MAX_SB_SIZE);
+ av1_combine_interintra(xd, bsize, 0, ypred, ystride, intrapredictor,
+ MAX_SB_SIZE);
+ }
+}
+
+void av1_build_interintra_predictors_sbc(MACROBLOCKD *xd, uint8_t *upred,
+ int ustride, int plane,
+ BLOCK_SIZE bsize) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, uintrapredictor[MAX_SB_SQUARE]);
+ av1_build_intra_predictors_for_interintra(
+ xd, bsize, plane, CONVERT_TO_BYTEPTR(uintrapredictor), MAX_SB_SIZE);
+ av1_combine_interintra(xd, bsize, plane, upred, ustride,
+ CONVERT_TO_BYTEPTR(uintrapredictor), MAX_SB_SIZE);
+ return;
+ }
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ {
+ DECLARE_ALIGNED(16, uint8_t, uintrapredictor[MAX_SB_SQUARE]);
+ av1_build_intra_predictors_for_interintra(xd, bsize, plane, uintrapredictor,
+ MAX_SB_SIZE);
+ av1_combine_interintra(xd, bsize, plane, upred, ustride, uintrapredictor,
+ MAX_SB_SIZE);
+ }
+}
+
+void av1_build_interintra_predictors_sbuv(MACROBLOCKD *xd, uint8_t *upred,
+ uint8_t *vpred, int ustride,
+ int vstride, BLOCK_SIZE bsize) {
+ av1_build_interintra_predictors_sbc(xd, upred, ustride, 1, bsize);
+ av1_build_interintra_predictors_sbc(xd, vpred, vstride, 2, bsize);
+}
+
+void av1_build_interintra_predictors(MACROBLOCKD *xd, uint8_t *ypred,
+ uint8_t *upred, uint8_t *vpred,
+ int ystride, int ustride, int vstride,
+ BLOCK_SIZE bsize) {
+ av1_build_interintra_predictors_sby(xd, ypred, ystride, bsize);
+ av1_build_interintra_predictors_sbuv(xd, upred, vpred, ustride, vstride,
+ bsize);
+}
+
+// Builds the inter predictor for the single-reference case, for use in the
+// encoder when searching the wedges efficiently.
+static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
+ int block, int bw, int bh, int x,
+ int y, int w, int h, int mi_x,
+ int mi_y, int ref,
+ uint8_t *const ext_dst,
+ int ext_dst_stride) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const MODE_INFO *mi = xd->mi[0];
+
+ const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = &pd->pre[ref];
+#if CONFIG_AOM_HIGHBITDEPTH
+ uint8_t *const dst =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? CONVERT_TO_BYTEPTR(ext_dst)
+ : ext_dst) +
+ ext_dst_stride * y + x;
+#else
+ uint8_t *const dst = ext_dst + ext_dst_stride * y + x;
+#endif
+ const MV mv = mi->mbmi.sb_type < BLOCK_8X8
+ ? average_split_mvs(pd, mi, ref, block)
+ : mi->mbmi.mv[ref].as_mv;
+
+ // TODO(jkoleszar): This clamping is done in the incorrect place for the
+ // scaling case. It needs to be done on the scaled MV, not the pre-scaling
+ // MV. Note however that it performs the subsampling aware scaling so
+ // that the result is always q4.
+  // The mv precision here is MV_PRECISION_Q4.
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, pd->subsampling_x,
+ pd->subsampling_y);
+
+ uint8_t *pre;
+ MV32 scaled_mv;
+ int xs, ys, subpel_x, subpel_y;
+ const int is_scaled = av1_is_scaled(sf);
+
+ if (is_scaled) {
+ pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
+ scaled_mv = av1_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+ xs = sf->x_step_q4;
+ ys = sf->y_step_q4;
+ } else {
+ pre = pre_buf->buf + (y * pre_buf->stride + x);
+ scaled_mv.row = mv_q4.row;
+ scaled_mv.col = mv_q4.col;
+ xs = ys = 16;
+ }
+
+ subpel_x = scaled_mv.col & SUBPEL_MASK;
+ subpel_y = scaled_mv.row & SUBPEL_MASK;
+ pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride +
+ (scaled_mv.col >> SUBPEL_BITS);
+
+ av1_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride, subpel_x,
+ subpel_y, sf, w, h, 0, mi->mbmi.interp_filter, xs,
+ ys, xd);
+}
+
+void av1_build_inter_predictors_for_planes_single_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
+ int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3]) {
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, &xd->plane[plane]);
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
+
+ if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
+ int x, y;
+ assert(bsize == BLOCK_8X8);
+ for (y = 0; y < num_4x4_h; ++y)
+ for (x = 0; x < num_4x4_w; ++x)
+ build_inter_predictors_single_buf(
+ xd, plane, y * 2 + x, bw, bh, 4 * x, 4 * y, 4, 4, mi_x, mi_y, ref,
+ ext_dst[plane], ext_dst_stride[plane]);
+ } else {
+ build_inter_predictors_single_buf(xd, plane, 0, bw, bh, 0, 0, bw, bh,
+ mi_x, mi_y, ref, ext_dst[plane],
+ ext_dst_stride[plane]);
+ }
+ }
+}
+
+static void build_wedge_inter_predictor_from_buf(
+ MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0,
+ int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int is_compound = has_second_ref(mbmi);
+ MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+
+ if (is_compound && is_interinter_wedge_used(mbmi->sb_type) &&
+ mbmi->use_wedge_interinter) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ build_masked_compound_wedge_highbd(
+ dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1,
+ mbmi->interinter_wedge_index, mbmi->interinter_wedge_sign,
+ mbmi->sb_type, h, w, xd->bd);
+ else
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ build_masked_compound_wedge(
+ dst, dst_buf->stride, ext_dst0, ext_dst_stride0, ext_dst1,
+ ext_dst_stride1, mbmi->interinter_wedge_index,
+ mbmi->interinter_wedge_sign, mbmi->sb_type, h, w);
+ } else {
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+ dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
+ xd->bd);
+ else
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
+ 0, NULL, 0, w, h);
+ }
+}
+
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane_from, int plane_to,
+ uint8_t *ext_dst0[3],
+ int ext_dst_stride0[3],
+ uint8_t *ext_dst1[3],
+ int ext_dst_stride1[3]) {
+ int plane;
+ for (plane = plane_from; plane <= plane_to; ++plane) {
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, &xd->plane[plane]);
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+
+ if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
+ int x, y;
+ assert(bsize == BLOCK_8X8);
+ for (y = 0; y < num_4x4_h; ++y)
+ for (x = 0; x < num_4x4_w; ++x)
+ build_wedge_inter_predictor_from_buf(
+ xd, plane, 4 * x, 4 * y, 4, 4, ext_dst0[plane],
+ ext_dst_stride0[plane], ext_dst1[plane], ext_dst_stride1[plane]);
+ } else {
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
+ build_wedge_inter_predictor_from_buf(
+ xd, plane, 0, 0, bw, bh, ext_dst0[plane], ext_dst_stride0[plane],
+ ext_dst1[plane], ext_dst_stride1[plane]);
+ }
+ }
+}
+#endif // CONFIG_EXT_INTER
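
Together, av1_build_inter_predictors_for_planes_single_buf and av1_build_wedge_inter_predictor_from_buf let the encoder motion-compensate each reference once and then evaluate many wedge shapes cheaply. A hedged usage sketch, assuming a low-bit-depth build and that xd, bsize, mi_row and mi_col come from the surrounding rate-distortion code; wedge_search_sketch and num_candidates are illustrative names:

static void wedge_search_sketch(MACROBLOCKD *xd, BLOCK_SIZE bsize, int mi_row,
                                int mi_col, int num_candidates) {
  DECLARE_ALIGNED(16, uint8_t, buf0[3][MAX_SB_SQUARE]);
  DECLARE_ALIGNED(16, uint8_t, buf1[3][MAX_SB_SQUARE]);
  uint8_t *dst0[3] = { buf0[0], buf0[1], buf0[2] };
  uint8_t *dst1[3] = { buf1[0], buf1[1], buf1[2] };
  int stride[3] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
  int candidate;

  // Motion-compensate each reference once (luma only: planes 0..0).
  av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, mi_row,
                                                   mi_col, 0, dst0, stride);
  av1_build_inter_predictors_for_planes_single_buf(xd, bsize, 0, 0, mi_row,
                                                   mi_col, 1, dst1, stride);

  xd->mi[0]->mbmi.use_wedge_interinter = 1;
  for (candidate = 0; candidate < num_candidates; ++candidate) {
    // Per candidate only the masked blend is redone; the blended prediction
    // lands in xd->plane[0].dst, where it would be distortion-scored.
    xd->mi[0]->mbmi.interinter_wedge_index = candidate;
    av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, dst0, stride,
                                             dst1, stride);
  }
}

A real search would also iterate the wedge sign and restore the mbmi fields after scoring each candidate.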
diff --git a/av1/common/reconinter.h b/av1/common/reconinter.h
index 10e6c4b..13f581e 100644
--- a/av1/common/reconinter.h
+++ b/av1/common/reconinter.h
@@ -12,11 +12,10 @@
#ifndef AV1_COMMON_RECONINTER_H_
#define AV1_COMMON_RECONINTER_H_
-#include "aom/aom_integer.h"
-#include "aom_dsp/aom_filter.h"
-#include "av1/common/convolve.h"
#include "av1/common/filter.h"
#include "av1/common/onyxc_int.h"
+#include "av1/common/convolve.h"
+#include "aom/aom_integer.h"
#ifdef __cplusplus
extern "C" {
@@ -26,49 +25,244 @@
uint8_t *dst, int dst_stride,
const int subpel_x, const int subpel_y,
const struct scale_factors *sf, int w, int h,
- int ref, const InterpFilter *interp_filter,
+ int ref_idx,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
int xs, int ys) {
+#if CONFIG_DUAL_FILTER
+ InterpFilterParams interp_filter_params_x =
+ av1_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
+ InterpFilterParams interp_filter_params_y =
+ av1_get_interp_filter_params(interp_filter[0 + 2 * ref_idx]);
+#else
InterpFilterParams interp_filter_params =
- get_interp_filter_params(*interp_filter);
- if (interp_filter_params.taps == SUBPEL_TAPS && w > 2 && h > 2) {
- const int16_t *filter_x =
- get_interp_filter_subpel_kernel(interp_filter_params, subpel_x);
- const int16_t *filter_y =
- get_interp_filter_subpel_kernel(interp_filter_params, subpel_y);
+ av1_get_interp_filter_params(interp_filter);
+#endif
- sf->predict[subpel_x != 0][subpel_y != 0][ref](
- src, src_stride, dst, dst_stride, filter_x, xs, filter_y, ys, w, h);
+#if CONFIG_DUAL_FILTER
+ if (interp_filter_params_x.taps == SUBPEL_TAPS &&
+ interp_filter_params_y.taps == SUBPEL_TAPS && w > 2 && h > 2) {
+ const int16_t *kernel_x =
+ av1_get_interp_filter_subpel_kernel(interp_filter_params_x, subpel_x);
+ const int16_t *kernel_y =
+ av1_get_interp_filter_subpel_kernel(interp_filter_params_y, subpel_y);
+#else
+ if (interp_filter_params.taps == SUBPEL_TAPS && w > 2 && h > 2) {
+ const int16_t *kernel_x =
+ av1_get_interp_filter_subpel_kernel(interp_filter_params, subpel_x);
+ const int16_t *kernel_y =
+ av1_get_interp_filter_subpel_kernel(interp_filter_params, subpel_y);
+#endif
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+ if (IsInterpolatingFilter(interp_filter)) {
+ // Interpolating filter
+ sf->predict[subpel_x != 0][subpel_y != 0][ref](
+ src, src_stride, dst, dst_stride, kernel_x, xs, kernel_y, ys, w, h);
+ } else {
+ sf->predict_ni[subpel_x != 0][subpel_y != 0][ref](
+ src, src_stride, dst, dst_stride, kernel_x, xs, kernel_y, ys, w, h);
+ }
+#else
+ sf->predict[subpel_x != 0][subpel_y != 0][ref_idx](
+ src, src_stride, dst, dst_stride, kernel_x, xs, kernel_y, ys, w, h);
+#endif // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
} else {
+    // ref_idx > 0 means this is the second reference frame: the first
+    // reference frame's prediction is already in dst, so the first and
+    // second results are averaged together.
av1_convolve(src, src_stride, dst, dst_stride, w, h, interp_filter,
- subpel_x, xs, subpel_y, ys, ref);
+ subpel_x, xs, subpel_y, ys, ref_idx);
}
}
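
The fallback comment above is the key to compound prediction here: when ref_idx is 1 the convolve path does not overwrite dst, it averages the new prediction into it. A minimal per-pixel sketch of that averaging step, assuming round-to-nearest; the function name is illustrative:

#include <stdint.h>

// Second-pass compound prediction: dst already holds the first reference's
// prediction; the second reference's prediction is averaged into it.
static void average_second_ref_sketch(uint8_t *dst, int dst_stride,
                                      const uint8_t *pred1, int pred1_stride,
                                      int w, int h) {
  int i, j;
  for (i = 0; i < h; ++i)
    for (j = 0; j < w; ++j)
      dst[i * dst_stride + j] = (uint8_t)(
          (dst[i * dst_stride + j] + pred1[i * pred1_stride + j] + 1) >> 1);
}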
#if CONFIG_AOM_HIGHBITDEPTH
-static INLINE void high_inter_predictor(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const int subpel_x, const int subpel_y,
- const struct scale_factors *sf, int w,
- int h, int ref,
- const InterpFilter *interp_filter,
- int xs, int ys, int bd) {
+static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int subpel_x,
+ const int subpel_y,
+ const struct scale_factors *sf, int w,
+ int h, int ref,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ int xs, int ys, int bd) {
+#if CONFIG_DUAL_FILTER
+ InterpFilterParams interp_filter_params_x =
+ av1_get_interp_filter_params(interp_filter[1 + 2 * ref]);
+ InterpFilterParams interp_filter_params_y =
+ av1_get_interp_filter_params(interp_filter[0 + 2 * ref]);
+#else
InterpFilterParams interp_filter_params =
- get_interp_filter_params(*interp_filter);
- if (interp_filter_params.taps == SUBPEL_TAPS && w > 2 && h > 2) {
- const int16_t *filter_x =
- get_interp_filter_subpel_kernel(interp_filter_params, subpel_x);
- const int16_t *filter_y =
- get_interp_filter_subpel_kernel(interp_filter_params, subpel_y);
+ av1_get_interp_filter_params(interp_filter);
+#endif
+#if CONFIG_DUAL_FILTER
+ if (interp_filter_params_x.taps == SUBPEL_TAPS &&
+ interp_filter_params_y.taps == SUBPEL_TAPS && w > 2 && h > 2) {
+ const int16_t *kernel_x =
+ av1_get_interp_filter_subpel_kernel(interp_filter_params_x, subpel_x);
+ const int16_t *kernel_y =
+ av1_get_interp_filter_subpel_kernel(interp_filter_params_y, subpel_y);
+#else
+ if (interp_filter_params.taps == SUBPEL_TAPS && w > 2 && h > 2) {
+ const int16_t *kernel_x =
+ av1_get_interp_filter_subpel_kernel(interp_filter_params, subpel_x);
+ const int16_t *kernel_y =
+ av1_get_interp_filter_subpel_kernel(interp_filter_params, subpel_y);
+#endif // CONFIG_DUAL_FILTER
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+ if (IsInterpolatingFilter(interp_filter)) {
+ // Interpolating filter
+ sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
+ src, src_stride, dst, dst_stride, kernel_x, xs, kernel_y, ys, w, h,
+ bd);
+ } else {
+ sf->highbd_predict_ni[subpel_x != 0][subpel_y != 0][ref](
+ src, src_stride, dst, dst_stride, kernel_x, xs, kernel_y, ys, w, h,
+ bd);
+ }
+#else
sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
- src, src_stride, dst, dst_stride, filter_x, xs, filter_y, ys, w, h, bd);
+ src, src_stride, dst, dst_stride, kernel_x, xs, kernel_y, ys, w, h, bd);
+#endif // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
} else {
+    // ref > 0 means this is the second reference frame, whose
+    // prediction must be averaged with the first reference frame's
+    // prediction, which is already stored in dst.
+ int avg = ref > 0;
av1_highbd_convolve(src, src_stride, dst, dst_stride, w, h, interp_filter,
- subpel_x, xs, subpel_y, ys, ref, bd);
+ subpel_x, xs, subpel_y, ys, avg, bd);
}
}
#endif // CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_EXT_INTER
+// Set to one to use larger codebooks
+#define USE_LARGE_WEDGE_CODEBOOK 0
+
+#if USE_LARGE_WEDGE_CODEBOOK
+#define MAX_WEDGE_TYPES (1 << 5)
+#else
+#define MAX_WEDGE_TYPES (1 << 4)
+#endif
+
+#define MAX_WEDGE_SIZE_LOG2 5 // 32x32
+#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
+#define MAX_WEDGE_SQUARE (MAX_WEDGE_SIZE * MAX_WEDGE_SIZE)
+
+#define WEDGE_WEIGHT_BITS 6
+
+#define WEDGE_NONE -1
+
+// Angles are measured anti-clockwise with respect to the horizontal axis.
+typedef enum {
+ WEDGE_HORIZONTAL = 0,
+ WEDGE_VERTICAL = 1,
+ WEDGE_OBLIQUE27 = 2,
+ WEDGE_OBLIQUE63 = 3,
+ WEDGE_OBLIQUE117 = 4,
+ WEDGE_OBLIQUE153 = 5,
+ WEDGE_DIRECTIONS
+} WedgeDirectionType;
+
+// 3-tuple: {direction, x_offset, y_offset}
+typedef struct {
+ WedgeDirectionType direction;
+ int x_offset;
+ int y_offset;
+} wedge_code_type;
+
+typedef uint8_t *wedge_masks_type[MAX_WEDGE_TYPES];
+
+typedef struct {
+ int bits;
+ const wedge_code_type *codebook;
+ uint8_t *signflip;
+ int smoother;
+ wedge_masks_type *masks;
+} wedge_params_type;
+
+extern const wedge_params_type wedge_params_lookup[BLOCK_SIZES];
+
+static INLINE int get_wedge_bits_lookup(BLOCK_SIZE sb_type) {
+ return wedge_params_lookup[sb_type].bits;
+}
+
+static INLINE int is_interinter_wedge_used(BLOCK_SIZE sb_type) {
+ (void)sb_type;
+ return wedge_params_lookup[sb_type].bits > 0;
+}
+
+static INLINE int get_interinter_wedge_bits(BLOCK_SIZE sb_type) {
+ const int wbits = wedge_params_lookup[sb_type].bits;
+ return (wbits > 0) ? wbits + 1 : 0;
+}
+
+static INLINE int is_interintra_wedge_used(BLOCK_SIZE sb_type) {
+ (void)sb_type;
+ return wedge_params_lookup[sb_type].bits > 0;
+}
+
+static INLINE int get_interintra_wedge_bits(BLOCK_SIZE sb_type) {
+ return wedge_params_lookup[sb_type].bits;
+}
+#endif // CONFIG_EXT_INTER
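Wedge masks store per-pixel weights with WEDGE_WEIGHT_BITS (6 bits) of precision, so the weights applied to the two inter predictors sum to 64. A minimal sketch of that blend, assuming mask values in [0, 64] (wedge_blend_sketch is illustrative, not an API added by this patch):

static void wedge_blend_sketch(uint8_t *dst, const uint8_t *p0,
                               const uint8_t *p1, const uint8_t *mask, int n) {
  int i;
  for (i = 0; i < n; ++i) {
    const int m = mask[i];  // 0 .. (1 << WEDGE_WEIGHT_BITS)
    dst[i] = (uint8_t)((m * p0[i] + ((1 << WEDGE_WEIGHT_BITS) - m) * p1[i] +
                        (1 << (WEDGE_WEIGHT_BITS - 1))) >>
                       WEDGE_WEIGHT_BITS);
  }
}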
+
+void build_inter_predictors(MACROBLOCKD *xd, int plane,
+#if CONFIG_MOTION_VAR
+ int mi_col_offset, int mi_row_offset,
+#endif // CONFIG_MOTION_VAR
+ int block, int bw, int bh, int x, int y, int w,
+ int h,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+ int wedge_offset_x, int wedge_offset_y,
+#endif // CONFIG_SUPERTX && CONFIG_EXT_INTER
+ int mi_x, int mi_y);
+
+static INLINE void av1_make_inter_predictor(
+ const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
+ const int subpel_x, const int subpel_y, const struct scale_factors *sf,
+ int w, int h, int ref,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ int xs, int ys, const MACROBLOCKD *xd) {
+ (void)xd;
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
+ sf, w, h, ref, interp_filter, xs, ys, xd->bd);
+ else
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y, sf, w,
+ h, ref, interp_filter, xs, ys);
+}
+
+#if CONFIG_EXT_INTER
+void av1_make_masked_inter_predictor(const uint8_t *pre, int pre_stride,
+ uint8_t *dst, int dst_stride,
+ const int subpel_x, const int subpel_y,
+ const struct scale_factors *sf, int w,
+ int h,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ int xs, int ys,
+#if CONFIG_SUPERTX
+ int wedge_offset_x, int wedge_offset_y,
+#endif // CONFIG_SUPERTX
+ const MACROBLOCKD *xd);
+#endif // CONFIG_EXT_INTER
+
static INLINE int round_mv_comp_q4(int value) {
return (value < 0 ? value - 2 : value + 2) / 4;
}
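round_mv_comp_q4() divides a summed motion-vector component by 4 while rounding to the nearest integer, with ties pushed away from zero; a few worked values (illustrative):

//   round_mv_comp_q4(6)  == (6 + 2) / 4  ==  2
//   round_mv_comp_q4(5)  == (5 + 2) / 4  ==  1
//   round_mv_comp_q4(-6) == (-6 - 2) / 4 == -2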
@@ -135,13 +329,6 @@
return res;
}
-void build_inter_predictors(MACROBLOCKD *xd, int plane,
-#if CONFIG_MOTION_VAR
- int mi_col_offset, int mi_row_offset,
-#endif // CONFIG_MOTION_VAR
- int block, int bw, int bh, int x, int y, int w,
- int h, int mi_x, int mi_y);
-
void av1_build_inter_predictor_sub8x8(MACROBLOCKD *xd, int plane, int i, int ir,
int ic, int mi_row, int mi_col);
@@ -157,18 +344,49 @@
void av1_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
+#if CONFIG_SUPERTX
+void av1_build_inter_predictors_sb_sub8x8_extend(MACROBLOCKD *xd,
+#if CONFIG_EXT_INTER
+ int mi_row_ori, int mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int block);
+
+void av1_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
+#if CONFIG_EXT_INTER
+ int mi_row_ori, int mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
+struct macroblockd_plane;
+void av1_build_masked_inter_predictor_complex(
+ MACROBLOCKD *xd, uint8_t *dst, int dst_stride, const uint8_t *pre,
+ int pre_stride, int mi_row, int mi_col, int mi_row_ori, int mi_col_ori,
+ BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, PARTITION_TYPE partition,
+ int plane);
+#endif // CONFIG_SUPERTX
+
void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, const MV *mv_q3,
const struct scale_factors *sf, int w, int h,
- int do_avg, const InterpFilter *interp_filter,
+ int do_avg,
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
enum mv_precision precision, int x, int y);
#if CONFIG_AOM_HIGHBITDEPTH
void av1_highbd_build_inter_predictor(
const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg,
- const InterpFilter *interp_filter, enum mv_precision precision, int x,
- int y, int bd);
+#if CONFIG_DUAL_FILTER
+ const InterpFilter *interp_filter,
+#else
+ const InterpFilter interp_filter,
+#endif
+ enum mv_precision precision, int x, int y, int bd);
#endif
static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
@@ -178,13 +396,17 @@
return y * stride + x;
}
-static INLINE void setup_pred_plane(struct buf_2d *dst, uint8_t *src,
- int stride, int mi_row, int mi_col,
+static INLINE void setup_pred_plane(struct buf_2d *dst, uint8_t *src, int width,
+ int height, int stride, int mi_row,
+ int mi_col,
const struct scale_factors *scale,
int subsampling_x, int subsampling_y) {
const int x = (MI_SIZE * mi_col) >> subsampling_x;
const int y = (MI_SIZE * mi_row) >> subsampling_y;
dst->buf = src + scaled_buffer_offset(x, y, stride, scale);
+ dst->buf0 = src;
+ dst->width = width;
+ dst->height = height;
dst->stride = stride;
}
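setup_pred_plane() now also records the buffer origin and dimensions; the start pointer is still derived from the mi position. A worked offset (illustrative), assuming MI_SIZE == 8, an unscaled reference (so scaled_buffer_offset() reduces to y * stride + x) and 4:2:0 chroma (subsampling_x == subsampling_y == 1):

//   mi_col = 4  ->  x = (MI_SIZE * 4) >> 1 = 16
//   mi_row = 2  ->  y = (MI_SIZE * 2) >> 1 = 8
//   dst->buf  = src + 8 * stride + 16
//   dst->buf0 = src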
@@ -196,31 +418,15 @@
const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
const struct scale_factors *sf);
-#if CONFIG_MOTION_VAR
-const uint8_t *av1_get_obmc_mask(int length);
-void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col,
- uint8_t *above[MAX_MB_PLANE],
- const int above_stride[MAX_MB_PLANE],
- uint8_t *left[MAX_MB_PLANE],
- const int left_stride[MAX_MB_PLANE]);
-void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col,
- uint8_t *tmp_buf[MAX_MB_PLANE],
- const int tmp_stride[MAX_MB_PLANE]);
-void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col,
- uint8_t *tmp_buf[MAX_MB_PLANE],
- const int tmp_stride[MAX_MB_PLANE]);
-void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col);
-#endif // CONFIG_MOTION_VAR
+// Detect whether the block has sub-pixel-level motion vectors for the
+// given component.
static INLINE int has_subpel_mv_component(const MODE_INFO *const mi,
const MACROBLOCKD *const xd,
int dir) {
const MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE bsize = mbmi->sb_type;
- const int ref = (dir >> 1);
+ int plane;
+ int ref = (dir >> 1);
if (bsize >= BLOCK_8X8) {
if (dir & 0x01) {
@@ -229,7 +435,6 @@
if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1;
}
} else {
- int plane;
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
const PARTITION_TYPE bp = BLOCK_8X8 - bsize;
const struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -251,11 +456,12 @@
}
}
}
+
return 0;
}
#define CHECK_SUBPEL 0
-static INLINE int is_interp_needed(const MACROBLOCKD *const xd) {
+static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) {
#if CHECK_SUBPEL
MODE_INFO *const mi = xd->mi[0];
const int is_compound = has_second_ref(&mi->mbmi);
@@ -276,6 +482,84 @@
#endif
}
+#if CONFIG_MOTION_VAR
+const uint8_t *av1_get_obmc_mask(int length);
+void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *above[MAX_MB_PLANE],
+ int above_stride[MAX_MB_PLANE],
+ uint8_t *left[MAX_MB_PLANE],
+ int left_stride[MAX_MB_PLANE]);
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col);
+#endif // CONFIG_MOTION_VAR
+
+#if CONFIG_EXT_INTER
+#define MASK_MASTER_SIZE (2 * MAX_SB_SIZE)
+#define MASK_MASTER_STRIDE (2 * MAX_SB_SIZE)
+
+void av1_init_wedge_masks();
+
+static INLINE const uint8_t *av1_get_contiguous_soft_mask(int wedge_index,
+ int wedge_sign,
+ BLOCK_SIZE sb_type) {
+ return wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index];
+}
+
+const uint8_t *av1_get_soft_mask(int wedge_index, int wedge_sign,
+ BLOCK_SIZE sb_type, int wedge_offset_x,
+ int wedge_offset_y);
+
+void av1_build_interintra_predictors(MACROBLOCKD *xd, uint8_t *ypred,
+ uint8_t *upred, uint8_t *vpred,
+ int ystride, int ustride, int vstride,
+ BLOCK_SIZE bsize);
+void av1_build_interintra_predictors_sby(MACROBLOCKD *xd, uint8_t *ypred,
+ int ystride, BLOCK_SIZE bsize);
+void av1_build_interintra_predictors_sbc(MACROBLOCKD *xd, uint8_t *upred,
+ int ustride, int plane,
+ BLOCK_SIZE bsize);
+void av1_build_interintra_predictors_sbuv(MACROBLOCKD *xd, uint8_t *upred,
+ uint8_t *vpred, int ustride,
+ int vstride, BLOCK_SIZE bsize);
+
+void av1_build_intra_predictors_for_interintra(MACROBLOCKD *xd,
+ BLOCK_SIZE bsize, int plane,
+ uint8_t *intra_pred,
+ int intra_stride);
+void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
+ const uint8_t *inter_pred, int inter_stride,
+ const uint8_t *intra_pred, int intra_stride);
+void av1_build_interintra_predictors_sbuv(MACROBLOCKD *xd, uint8_t *upred,
+ uint8_t *vpred, int ustride,
+ int vstride, BLOCK_SIZE bsize);
+void av1_build_interintra_predictors_sby(MACROBLOCKD *xd, uint8_t *ypred,
+ int ystride, BLOCK_SIZE bsize);
+
+// Encoder only
+void av1_build_inter_predictors_for_planes_single_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
+ int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3]);
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane_from, int plane_to,
+ uint8_t *ext_dst0[3],
+ int ext_dst_stride0[3],
+ uint8_t *ext_dst1[3],
+ int ext_dst_stride1[3]);
+#endif // CONFIG_EXT_INTER
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index fede6ca..5bb232d 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -9,8 +9,12 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <math.h>
+
+#include "./av1_rtcd.h"
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
+#include "aom_ports/system_state.h"
#if CONFIG_AOM_HIGHBITDEPTH
#include "aom_dsp/aom_dsp_common.h"
@@ -18,7 +22,6 @@
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
#include "aom_ports/aom_once.h"
-
#include "av1/common/reconintra.h"
#include "av1/common/onyxc_int.h"
@@ -43,45 +46,189 @@
NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // TM
};
-static const uint8_t orders_64x64[1] = { 0 };
-static const uint8_t orders_64x32[2] = { 0, 1 };
-static const uint8_t orders_32x64[2] = { 0, 1 };
-static const uint8_t orders_32x32[4] = {
+static const uint8_t orders_128x128[1] = { 0 };
+static const uint8_t orders_128x64[2] = { 0, 1 };
+static const uint8_t orders_64x128[2] = { 0, 1 };
+static const uint8_t orders_64x64[4] = {
0, 1, 2, 3,
};
-static const uint8_t orders_32x16[8] = {
+static const uint8_t orders_64x32[8] = {
0, 2, 1, 3, 4, 6, 5, 7,
};
-static const uint8_t orders_16x32[8] = {
+static const uint8_t orders_32x64[8] = {
0, 1, 2, 3, 4, 5, 6, 7,
};
-static const uint8_t orders_16x16[16] = {
+static const uint8_t orders_32x32[16] = {
0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15,
};
-static const uint8_t orders_16x8[32] = {
+static const uint8_t orders_32x16[32] = {
0, 2, 8, 10, 1, 3, 9, 11, 4, 6, 12, 14, 5, 7, 13, 15,
16, 18, 24, 26, 17, 19, 25, 27, 20, 22, 28, 30, 21, 23, 29, 31,
};
-static const uint8_t orders_8x16[32] = {
+static const uint8_t orders_16x32[32] = {
0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31,
};
-static const uint8_t orders_8x8[64] = {
+static const uint8_t orders_16x16[64] = {
0, 1, 4, 5, 16, 17, 20, 21, 2, 3, 6, 7, 18, 19, 22, 23,
8, 9, 12, 13, 24, 25, 28, 29, 10, 11, 14, 15, 26, 27, 30, 31,
32, 33, 36, 37, 48, 49, 52, 53, 34, 35, 38, 39, 50, 51, 54, 55,
40, 41, 44, 45, 56, 57, 60, 61, 42, 43, 46, 47, 58, 59, 62, 63,
};
-static const uint8_t *const orders[BLOCK_SIZES] = {
- orders_8x8, orders_8x8, orders_8x8, orders_8x8, orders_8x16,
- orders_16x8, orders_16x16, orders_16x32, orders_32x16, orders_32x32,
- orders_32x64, orders_64x32, orders_64x64,
+
+#if CONFIG_EXT_PARTITION
+static const uint8_t orders_16x8[128] = {
+ 0, 2, 8, 10, 32, 34, 40, 42, 1, 3, 9, 11, 33, 35, 41, 43,
+ 4, 6, 12, 14, 36, 38, 44, 46, 5, 7, 13, 15, 37, 39, 45, 47,
+ 16, 18, 24, 26, 48, 50, 56, 58, 17, 19, 25, 27, 49, 51, 57, 59,
+ 20, 22, 28, 30, 52, 54, 60, 62, 21, 23, 29, 31, 53, 55, 61, 63,
+ 64, 66, 72, 74, 96, 98, 104, 106, 65, 67, 73, 75, 97, 99, 105, 107,
+ 68, 70, 76, 78, 100, 102, 108, 110, 69, 71, 77, 79, 101, 103, 109, 111,
+ 80, 82, 88, 90, 112, 114, 120, 122, 81, 83, 89, 91, 113, 115, 121, 123,
+ 84, 86, 92, 94, 116, 118, 124, 126, 85, 87, 93, 95, 117, 119, 125, 127,
+};
+static const uint8_t orders_8x16[128] = {
+ 0, 1, 2, 3, 8, 9, 10, 11, 32, 33, 34, 35, 40, 41, 42, 43,
+ 4, 5, 6, 7, 12, 13, 14, 15, 36, 37, 38, 39, 44, 45, 46, 47,
+ 16, 17, 18, 19, 24, 25, 26, 27, 48, 49, 50, 51, 56, 57, 58, 59,
+ 20, 21, 22, 23, 28, 29, 30, 31, 52, 53, 54, 55, 60, 61, 62, 63,
+ 64, 65, 66, 67, 72, 73, 74, 75, 96, 97, 98, 99, 104, 105, 106, 107,
+ 68, 69, 70, 71, 76, 77, 78, 79, 100, 101, 102, 103, 108, 109, 110, 111,
+ 80, 81, 82, 83, 88, 89, 90, 91, 112, 113, 114, 115, 120, 121, 122, 123,
+ 84, 85, 86, 87, 92, 93, 94, 95, 116, 117, 118, 119, 124, 125, 126, 127,
+};
+static const uint8_t orders_8x8[256] = {
+ 0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, 81, 84,
+ 85, 2, 3, 6, 7, 18, 19, 22, 23, 66, 67, 70, 71, 82, 83,
+ 86, 87, 8, 9, 12, 13, 24, 25, 28, 29, 72, 73, 76, 77, 88,
+ 89, 92, 93, 10, 11, 14, 15, 26, 27, 30, 31, 74, 75, 78, 79,
+ 90, 91, 94, 95, 32, 33, 36, 37, 48, 49, 52, 53, 96, 97, 100,
+ 101, 112, 113, 116, 117, 34, 35, 38, 39, 50, 51, 54, 55, 98, 99,
+ 102, 103, 114, 115, 118, 119, 40, 41, 44, 45, 56, 57, 60, 61, 104,
+ 105, 108, 109, 120, 121, 124, 125, 42, 43, 46, 47, 58, 59, 62, 63,
+ 106, 107, 110, 111, 122, 123, 126, 127, 128, 129, 132, 133, 144, 145, 148,
+ 149, 192, 193, 196, 197, 208, 209, 212, 213, 130, 131, 134, 135, 146, 147,
+ 150, 151, 194, 195, 198, 199, 210, 211, 214, 215, 136, 137, 140, 141, 152,
+ 153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 138, 139, 142, 143,
+ 154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, 160, 161, 164,
+ 165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, 162, 163,
+ 166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, 168,
+ 169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253,
+ 170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254,
+ 255,
};
+/* clang-format off */
+static const uint8_t *const orders[BLOCK_SIZES] = {
+ // 4X4
+ orders_8x8,
+ // 4X8, 8X4, 8X8
+ orders_8x8, orders_8x8, orders_8x8,
+ // 8X16, 16X8, 16X16
+ orders_8x16, orders_16x8, orders_16x16,
+ // 16X32, 32X16, 32X32
+ orders_16x32, orders_32x16, orders_32x32,
+ // 32X64, 64X32, 64X64
+ orders_32x64, orders_64x32, orders_64x64,
+ // 64x128, 128x64, 128x128
+ orders_64x128, orders_128x64, orders_128x128
+};
+/* clang-format on */
+#else
+/* clang-format off */
+static const uint8_t *const orders[BLOCK_SIZES] = {
+ // 4X4
+ orders_16x16,
+ // 4X8, 8X4, 8X8
+ orders_16x16, orders_16x16, orders_16x16,
+ // 8X16, 16X8, 16X16
+ orders_16x32, orders_32x16, orders_32x32,
+ // 16X32, 32X16, 32X32
+ orders_32x64, orders_64x32, orders_64x64,
+ // 32X64, 64X32, 64X64
+ orders_64x128, orders_128x64, orders_128x128
+};
+/* clang-format on */
+#endif // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_PARTITION_TYPES
+static const uint8_t orders_verta_64x64[4] = {
+ 0, 2, 1, 2,
+};
+static const uint8_t orders_verta_32x32[16] = {
+ 0, 2, 4, 6, 1, 2, 5, 6, 8, 10, 12, 14, 9, 10, 13, 14,
+};
+static const uint8_t orders_verta_16x16[64] = {
+ 0, 2, 4, 6, 16, 18, 20, 22, 1, 2, 5, 6, 17, 18, 21, 22,
+ 8, 10, 12, 14, 24, 26, 28, 30, 9, 10, 13, 14, 25, 26, 29, 30,
+ 32, 34, 36, 38, 48, 50, 52, 54, 33, 34, 37, 38, 49, 50, 53, 54,
+ 40, 42, 44, 46, 56, 58, 60, 62, 41, 42, 45, 46, 57, 58, 61, 62,
+};
+#if CONFIG_EXT_PARTITION
+static const uint8_t orders_verta_8x8[256] = {
+ 0, 2, 4, 6, 16, 18, 20, 22, 64, 66, 68, 70, 80, 82, 84,
+ 86, 1, 2, 5, 6, 17, 18, 21, 22, 65, 66, 69, 70, 81, 82,
+ 85, 86, 8, 10, 12, 14, 24, 26, 28, 30, 72, 74, 76, 78, 88,
+ 90, 92, 94, 9, 10, 13, 14, 25, 26, 29, 30, 73, 74, 77, 78,
+ 89, 90, 93, 94, 32, 34, 36, 38, 48, 50, 52, 54, 96, 98, 100,
+ 102, 112, 114, 116, 118, 33, 34, 37, 38, 49, 50, 53, 54, 97, 98,
+ 101, 102, 113, 114, 117, 118, 40, 42, 44, 46, 56, 58, 60, 62, 104,
+ 106, 108, 110, 120, 122, 124, 126, 41, 42, 45, 46, 57, 58, 61, 62,
+ 105, 106, 109, 110, 121, 122, 125, 126, 128, 130, 132, 134, 144, 146, 148,
+ 150, 192, 194, 196, 198, 208, 210, 212, 214, 129, 130, 133, 134, 145, 146,
+ 149, 150, 193, 194, 197, 198, 209, 210, 213, 214, 136, 138, 140, 142, 152,
+ 154, 156, 158, 200, 202, 204, 206, 216, 218, 220, 222, 137, 138, 141, 142,
+ 153, 154, 157, 158, 201, 202, 205, 206, 217, 218, 221, 222, 160, 162, 164,
+ 166, 176, 178, 180, 182, 224, 226, 228, 230, 240, 242, 244, 246, 161, 162,
+ 165, 166, 177, 178, 181, 182, 225, 226, 229, 230, 241, 242, 245, 246, 168,
+ 170, 172, 174, 184, 186, 188, 190, 232, 234, 236, 238, 248, 250, 252, 254,
+ 169, 170, 173, 174, 185, 186, 189, 190, 233, 234, 237, 238, 249, 250, 253,
+ 254,
+};
+
+/* clang-format off */
+static const uint8_t *const orders_verta[BLOCK_SIZES] = {
+ // 4X4
+ orders_verta_8x8,
+ // 4X8, 8X4, 8X8
+ orders_verta_8x8, orders_verta_8x8, orders_verta_8x8,
+ // 8X16, 16X8, 16X16
+ orders_8x16, orders_16x8, orders_verta_16x16,
+ // 16X32, 32X16, 32X32
+ orders_16x32, orders_32x16, orders_verta_32x32,
+ // 32X64, 64X32, 64X64
+ orders_32x64, orders_64x32, orders_verta_64x64,
+ // 64x128, 128x64, 128x128
+ orders_64x128, orders_128x64, orders_128x128
+};
+/* clang-format on */
+#else
+/* clang-format off */
+static const uint8_t *const orders_verta[BLOCK_SIZES] = {
+ // 4X4
+ orders_verta_16x16,
+ // 4X8, 8X4, 8X8
+ orders_verta_16x16, orders_verta_16x16, orders_verta_16x16,
+ // 8X16, 16X8, 16X16
+ orders_16x32, orders_32x16, orders_verta_32x32,
+ // 16X32, 32X16, 32X32
+ orders_32x64, orders_64x32, orders_verta_64x64,
+ // 32X64, 64X32, 64X64
+ orders_64x128, orders_128x64, orders_128x128
+};
+/* clang-format on */
+#endif // CONFIG_EXT_PARTITION
+#endif // CONFIG_EXT_PARTITION_TYPES
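These order tables record the coding order of sub-blocks within a superblock; av1_has_right() and av1_has_bottom() compare the current block's order index against its top-right or bottom-left neighbour's index to decide whether that neighbour has already been reconstructed. A stripped-down sketch of the test (illustrative only, not part of this patch):

// The top-right neighbour is usable for intra prediction only if it was
// coded before the current block, i.e. its order index is smaller.
static int top_right_available_sketch(const uint8_t *order, int cols, int row,
                                      int col) {
  const int my_order = order[row * cols + col];
  const int tr_order = order[(row - 1) * cols + (col + 1)];
  return tr_order < my_order;
}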
+
static int av1_has_right(BLOCK_SIZE bsize, int mi_row, int mi_col,
- int right_available, TX_SIZE txsz, int y, int x,
- int ss_x) {
- if (!right_available) return 0;
+ int right_available,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition,
+#endif
+ TX_SIZE txsz, int y, int x, int ss_x) {
+ const int wl = mi_width_log2_lookup[bsize];
+ const int w = AOMMAX(num_4x4_blocks_wide_lookup[bsize] >> ss_x, 1);
+ const int step = tx_size_wide_unit[txsz];
// TODO(bshacklett, huisu): Currently the RD loop traverses 4X8 blocks in
// inverted N order while in the bitstream the subblocks are stored in Z
@@ -91,32 +238,38 @@
// blocks in inverted N order, and then update this function appropriately.
if (bsize == BLOCK_4X8 && y == 1) return 0;
+ if (!right_available) return 0;
+
+  // Handle block sizes 4x8 and 4x4
+ if (ss_x == 0 && num_4x4_blocks_wide_lookup[bsize] < 2 && x == 0) return 1;
+
if (y == 0) {
- const int wl = mi_width_log2_lookup[bsize];
const int hl = mi_height_log2_lookup[bsize];
- const int w = 1 << (wl + 1 - ss_x);
- const int step = tx_size_1d_in_unit[txsz];
- const uint8_t *order = orders[bsize];
+ const uint8_t *order;
int my_order, tr_order;
+#if CONFIG_EXT_PARTITION_TYPES
+ if (partition == PARTITION_VERT_A)
+ order = orders_verta[bsize];
+ else
+#endif // CONFIG_EXT_PARTITION_TYPES
+ order = orders[bsize];
if (x + step < w) return 1;
- mi_row = (mi_row & 7) >> hl;
- mi_col = (mi_col & 7) >> wl;
+ mi_row = (mi_row & MAX_MIB_MASK) >> hl;
+ mi_col = (mi_col & MAX_MIB_MASK) >> wl;
+    // If this is the top row of the coding unit
if (mi_row == 0) return 1;
- if (((mi_col + 1) << wl) >= 8) return 0;
+    // If this is the rightmost column of the coding unit
+ if (((mi_col + 1) << wl) >= MAX_MIB_SIZE) return 0;
- my_order = order[((mi_row + 0) << (3 - wl)) + mi_col + 0];
- tr_order = order[((mi_row - 1) << (3 - wl)) + mi_col + 1];
+ my_order = order[((mi_row + 0) << (MAX_MIB_SIZE_LOG2 - wl)) + mi_col + 0];
+ tr_order = order[((mi_row - 1) << (MAX_MIB_SIZE_LOG2 - wl)) + mi_col + 1];
return my_order > tr_order;
} else {
- const int wl = mi_width_log2_lookup[bsize];
- const int w = 1 << (wl + 1 - ss_x);
- const int step = tx_size_1d_in_unit[txsz];
-
return x + step < w;
}
}
@@ -130,25 +283,27 @@
const int wl = mi_width_log2_lookup[bsize];
const int hl = mi_height_log2_lookup[bsize];
const int h = 1 << (hl + 1 - ss_y);
- const int step = tx_size_1d_in_unit[txsz];
+ const int step = tx_size_wide_unit[txsz];
const uint8_t *order = orders[bsize];
int my_order, bl_order;
- mi_row = (mi_row & 7) >> hl;
- mi_col = (mi_col & 7) >> wl;
-
- if (mi_col == 0)
- return bottom_available &&
- (mi_row << (hl + !ss_y)) + y + step < (8 << !ss_y);
-
- if (((mi_row + 1) << hl) >= 8) return 0;
+    // Handle block sizes 8x4 and 4x4
+ if (ss_y == 0 && num_4x4_blocks_high_lookup[bsize] < 2 && y == 0) return 1;
if (y + step < h) return 1;
- my_order = order[((mi_row + 0) << (3 - wl)) + mi_col + 0];
- bl_order = order[((mi_row + 1) << (3 - wl)) + mi_col - 1];
+ mi_row = (mi_row & MAX_MIB_MASK) >> hl;
+ mi_col = (mi_col & MAX_MIB_MASK) >> wl;
- return bl_order < my_order && bottom_available;
+ if (mi_col == 0)
+ return (mi_row << (hl + !ss_y)) + y + step < (MAX_MIB_SIZE << !ss_y);
+
+ if (((mi_row + 1) << hl) >= MAX_MIB_SIZE) return 0;
+
+ my_order = order[((mi_row + 0) << (MAX_MIB_SIZE_LOG2 - wl)) + mi_col + 0];
+ bl_order = order[((mi_row + 1) << (MAX_MIB_SIZE_LOG2 - wl)) + mi_col - 1];
+
+ return bl_order < my_order;
}
}
@@ -162,8 +317,8 @@
typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride,
const uint16_t *above, const uint16_t *left,
int bd);
-static intra_high_pred_fn pred_high[INTRA_MODES][4];
-static intra_high_pred_fn dc_pred_high[2][2][4];
+static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES];
+static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES];
#endif // CONFIG_AOM_HIGHBITDEPTH
static void av1_init_intra_predictors_internal(void) {
@@ -184,7 +339,12 @@
INIT_ALL_SIZES(pred[D117_PRED], d117);
INIT_ALL_SIZES(pred[D135_PRED], d135);
INIT_ALL_SIZES(pred[D153_PRED], d153);
+
+#if CONFIG_ALT_INTRA
+ INIT_ALL_SIZES(pred[TM_PRED], paeth);
+#else
INIT_ALL_SIZES(pred[TM_PRED], tm);
+#endif // CONFIG_ALT_INTRA
INIT_ALL_SIZES(dc_pred[0][0], dc_128);
INIT_ALL_SIZES(dc_pred[0][1], dc_top);
@@ -200,7 +360,12 @@
INIT_ALL_SIZES(pred_high[D117_PRED], highbd_d117);
INIT_ALL_SIZES(pred_high[D135_PRED], highbd_d135);
INIT_ALL_SIZES(pred_high[D153_PRED], highbd_d153);
+
+#if CONFIG_ALT_INTRA
+ INIT_ALL_SIZES(pred_high[TM_PRED], highbd_paeth);
+#else
INIT_ALL_SIZES(pred_high[TM_PRED], highbd_tm);
+#endif // CONFIG_ALT_INTRA
INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128);
INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top);
@@ -212,20 +377,39 @@
}
#if CONFIG_EXT_INTRA
-const int16_t dr_intra_derivative[90] = {
- 1, 14666, 7330, 4884, 3660, 2926, 2435, 2084, 1821, 1616, 1451, 1317, 1204,
- 1108, 1026, 955, 892, 837, 787, 743, 703, 666, 633, 603, 574, 548,
- 524, 502, 481, 461, 443, 426, 409, 394, 379, 365, 352, 339, 327,
- 316, 305, 294, 284, 274, 265, 256, 247, 238, 230, 222, 214, 207,
- 200, 192, 185, 179, 172, 166, 159, 153, 147, 141, 136, 130, 124,
- 119, 113, 108, 103, 98, 93, 88, 83, 78, 73, 68, 63, 59,
- 54, 49, 45, 40, 35, 31, 26, 22, 17, 13, 8, 4,
-};
+static int intra_subpel_interp(int base, int shift, const uint8_t *ref,
+ int ref_start_idx, int ref_end_idx,
+ INTRA_FILTER filter_type) {
+ int val, k, idx, filter_idx = 0;
+ const int16_t *filter = NULL;
+
+ if (filter_type == INTRA_FILTER_LINEAR) {
+ val = ref[base] * (256 - shift) + ref[base + 1] * shift;
+ val = ROUND_POWER_OF_TWO(val, 8);
+ } else {
+ filter_idx = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
+ filter = av1_intra_filter_kernels[filter_type][filter_idx];
+
+ if (filter_idx < (1 << SUBPEL_BITS)) {
+ val = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k) {
+ idx = base + 1 - (SUBPEL_TAPS / 2) + k;
+ idx = AOMMAX(AOMMIN(idx, ref_end_idx), ref_start_idx);
+ val += ref[idx] * filter[k];
+ }
+ val = ROUND_POWER_OF_TWO(val, FILTER_BITS);
+ } else {
+ val = ref[base + 1];
+ }
+ }
+
+ return val;
+}
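With INTRA_FILTER_LINEAR the interpolation is a plain 8-bit fixed-point lerp between two neighbouring reference samples. One worked instance (illustrative), with shift = 64, i.e. a quarter of the way towards the next sample:

//   val = ref[base] * (256 - 64) + ref[base + 1] * 64
//       = 192 * ref[base] + 64 * ref[base + 1]
//   out = ROUND_POWER_OF_TWO(val, 8) = (val + 128) >> 8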
// Directional prediction, zone 1: 0 < angle < 90
static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *const above,
- const uint8_t *const left, int dx, int dy) {
+ const uint8_t *above, const uint8_t *left, int dx,
+ int dy, INTRA_FILTER filter_type) {
int r, c, x, base, shift, val;
(void)left;
@@ -233,6 +417,66 @@
assert(dy == 1);
assert(dx < 0);
+ if (filter_type != INTRA_FILTER_LINEAR) {
+ const int pad_size = SUBPEL_TAPS >> 1;
+ int len;
+ DECLARE_ALIGNED(16, uint8_t, buf[SUBPEL_SHIFTS][MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, src[MAX_SB_SIZE + SUBPEL_TAPS]);
+ uint8_t flags[SUBPEL_SHIFTS];
+
+ memset(flags, 0, SUBPEL_SHIFTS * sizeof(flags[0]));
+ memset(src, above[0], pad_size * sizeof(above[0]));
+ memcpy(src + pad_size, above, 2 * bs * sizeof(above[0]));
+ memset(src + pad_size + 2 * bs, above[2 * bs - 1],
+ pad_size * sizeof(above[0]));
+ flags[0] = 1;
+ x = -dx;
+ for (r = 0; r < bs; ++r, dst += stride, x -= dx) {
+ base = x >> 8;
+ shift = x & 0xFF;
+ shift = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
+ if (shift == SUBPEL_SHIFTS) {
+ base += 1;
+ shift = 0;
+ }
+ len = AOMMIN(bs, 2 * bs - 1 - base);
+ if (len <= 0) {
+ int i;
+ for (i = r; i < bs; ++i) {
+ memset(dst, above[2 * bs - 1], bs * sizeof(dst[0]));
+ dst += stride;
+ }
+ return;
+ }
+
+ if (len <= (bs >> 1) && !flags[shift]) {
+ base = x >> 8;
+ shift = x & 0xFF;
+ for (c = 0; c < len; ++c) {
+ val = intra_subpel_interp(base, shift, above, 0, 2 * bs - 1,
+ filter_type);
+ dst[c] = clip_pixel(val);
+ ++base;
+ }
+ } else {
+ if (!flags[shift]) {
+ const int16_t *filter = av1_intra_filter_kernels[filter_type][shift];
+ aom_convolve8_horiz(src + pad_size, 2 * bs, buf[shift], 2 * bs,
+ filter, 16, NULL, 16, 2 * bs,
+ 2 * bs < 16 ? 2 : 1);
+ flags[shift] = 1;
+ }
+ memcpy(dst, shift == 0 ? src + pad_size + base : &buf[shift][base],
+ len * sizeof(dst[0]));
+ }
+
+ if (len < bs)
+ memset(dst + len, above[2 * bs - 1], (bs - len) * sizeof(dst[0]));
+ }
+ return;
+ }
+
+  // For the linear filter, the plain C code is faster.
x = -dx;
for (r = 0; r < bs; ++r, dst += stride, x -= dx) {
base = x >> 8;
@@ -261,8 +505,8 @@
// Directional prediction, zone 2: 90 < angle < 180
static void dr_prediction_z2(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *const above,
- const uint8_t *const left, int dx, int dy) {
+ const uint8_t *above, const uint8_t *left, int dx,
+ int dy, INTRA_FILTER filter_type) {
int r, c, x, y, shift1, shift2, val, base1, base2;
assert(dx > 0);
@@ -275,14 +519,14 @@
for (c = 0; c < bs; ++c, ++base1, y -= dy) {
if (base1 >= -1) {
shift1 = x & 0xFF;
- val = above[base1] * (256 - shift1) + above[base1 + 1] * shift1;
- val = ROUND_POWER_OF_TWO(val, 8);
+ val =
+ intra_subpel_interp(base1, shift1, above, -1, bs - 1, filter_type);
} else {
base2 = y >> 8;
if (base2 >= 0) {
shift2 = y & 0xFF;
- val = left[base2] * (256 - shift2) + left[base2 + 1] * shift2;
- val = ROUND_POWER_OF_TWO(val, 8);
+ val =
+ intra_subpel_interp(base2, shift2, left, 0, bs - 1, filter_type);
} else {
val = left[0];
}
@@ -294,15 +538,86 @@
// Directional prediction, zone 3: 180 < angle < 270
static void dr_prediction_z3(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *const above,
- const uint8_t *const left, int dx, int dy) {
+ const uint8_t *above, const uint8_t *left, int dx,
+ int dy, INTRA_FILTER filter_type) {
int r, c, y, base, shift, val;
(void)above;
(void)dx;
+
assert(dx == 1);
assert(dy < 0);
+ if (filter_type != INTRA_FILTER_LINEAR) {
+ const int pad_size = SUBPEL_TAPS >> 1;
+ int len, i;
+ DECLARE_ALIGNED(16, uint8_t, buf[MAX_SB_SIZE][4 * SUBPEL_SHIFTS]);
+ DECLARE_ALIGNED(16, uint8_t, src[(MAX_SB_SIZE + SUBPEL_TAPS) * 4]);
+ uint8_t flags[SUBPEL_SHIFTS];
+
+ memset(flags, 0, SUBPEL_SHIFTS * sizeof(flags[0]));
+ for (i = 0; i < pad_size; ++i) src[4 * i] = left[0];
+ for (i = 0; i < 2 * bs; ++i) src[4 * (i + pad_size)] = left[i];
+ for (i = 0; i < pad_size; ++i)
+ src[4 * (i + 2 * bs + pad_size)] = left[2 * bs - 1];
+ flags[0] = 1;
+ y = -dy;
+ for (c = 0; c < bs; ++c, y -= dy) {
+ base = y >> 8;
+ shift = y & 0xFF;
+ shift = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
+ if (shift == SUBPEL_SHIFTS) {
+ base += 1;
+ shift = 0;
+ }
+ len = AOMMIN(bs, 2 * bs - 1 - base);
+
+ if (len <= 0) {
+ for (r = 0; r < bs; ++r) {
+ dst[r * stride + c] = left[2 * bs - 1];
+ }
+ continue;
+ }
+
+ if (len <= (bs >> 1) && !flags[shift]) {
+ base = y >> 8;
+ shift = y & 0xFF;
+ for (r = 0; r < len; ++r) {
+ val = intra_subpel_interp(base, shift, left, 0, 2 * bs - 1,
+ filter_type);
+ dst[r * stride + c] = clip_pixel(val);
+ ++base;
+ }
+ } else {
+ if (!flags[shift]) {
+ const int16_t *filter = av1_intra_filter_kernels[filter_type][shift];
+ aom_convolve8_vert(src + 4 * pad_size, 4, buf[0] + 4 * shift,
+ 4 * SUBPEL_SHIFTS, NULL, 16, filter, 16,
+ 2 * bs < 16 ? 4 : 4, 2 * bs);
+ flags[shift] = 1;
+ }
+
+ if (shift == 0) {
+ for (r = 0; r < len; ++r) {
+ dst[r * stride + c] = left[r + base];
+ }
+ } else {
+ for (r = 0; r < len; ++r) {
+ dst[r * stride + c] = buf[r + base][4 * shift];
+ }
+ }
+ }
+
+ if (len < bs) {
+ for (r = len; r < bs; ++r) {
+ dst[r * stride + c] = left[2 * bs - 1];
+ }
+ }
+ }
+ return;
+ }
+
+  // For the linear filter, the plain C code is faster.
y = -dy;
for (c = 0; c < bs; ++c, y -= dy) {
base = y >> 8;
@@ -325,7 +640,7 @@
// If angle > 0 && angle < 90, dx = -((int)(256 / t));
// If angle > 90 && angle < 180, dx = (int)(256 / t);
// If angle > 180 && angle < 270, dx = 1;
-static inline int get_dx(int angle) {
+static INLINE int get_dx(int angle) {
if (angle > 0 && angle < 90) {
return -dr_intra_derivative[angle];
} else if (angle > 90 && angle < 180) {
@@ -340,7 +655,7 @@
// If angle > 0 && angle < 90, dy = 1;
// If angle > 90 && angle < 180, dy = (int)(256 * t);
// If angle > 180 && angle < 270, dy = -((int)(256 * t));
-static inline int get_dy(int angle) {
+static INLINE int get_dy(int angle) {
if (angle > 90 && angle < 180) {
return dr_intra_derivative[angle - 90];
} else if (angle > 180 && angle < 270) {
@@ -352,244 +667,608 @@
}
static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
- const uint8_t *const above, const uint8_t *const left,
- int angle) {
+ const uint8_t *above, const uint8_t *left, int angle,
+ INTRA_FILTER filter_type) {
const int dx = get_dx(angle);
const int dy = get_dy(angle);
- const int bs = tx_size_1d[tx_size];
-
+ const int bs = tx_size_wide[tx_size];
assert(angle > 0 && angle < 270);
- switch (angle) {
- case 90: pred[V_PRED][tx_size](dst, stride, above, left); return;
- case 180: pred[H_PRED][tx_size](dst, stride, above, left); return;
- case 45: pred[D45_PRED][tx_size](dst, stride, above, left); return;
- case 135: pred[D135_PRED][tx_size](dst, stride, above, left); return;
- case 117: pred[D117_PRED][tx_size](dst, stride, above, left); return;
- case 153: pred[D153_PRED][tx_size](dst, stride, above, left); return;
- case 207: pred[D207_PRED][tx_size](dst, stride, above, left); return;
- case 63: pred[D63_PRED][tx_size](dst, stride, above, left); return;
- default: break;
- }
if (angle > 0 && angle < 90) {
- dr_prediction_z1(dst, stride, bs, above, left, dx, dy);
+ dr_prediction_z1(dst, stride, bs, above, left, dx, dy, filter_type);
} else if (angle > 90 && angle < 180) {
- dr_prediction_z2(dst, stride, bs, above, left, dx, dy);
+ dr_prediction_z2(dst, stride, bs, above, left, dx, dy, filter_type);
} else if (angle > 180 && angle < 270) {
- dr_prediction_z3(dst, stride, bs, above, left, dx, dy);
- } else {
- assert(0);
+ dr_prediction_z3(dst, stride, bs, above, left, dx, dy, filter_type);
+ } else if (angle == 90) {
+ pred[V_PRED][tx_size](dst, stride, above, left);
+ } else if (angle == 180) {
+ pred[H_PRED][tx_size](dst, stride, above, left);
}
}
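dr_predictor() turns the prediction angle into the (dx, dy) stepping pair through dr_intra_derivative, which stores 256 / tan(angle) as described in the comments above get_dx()/get_dy(). One worked value (illustrative):

//   angle = 45:  tan(45) = 1, so dr_intra_derivative[45] = 256
//                -> dx = -256, dy = 1  (zone 1, 0 < angle < 90)
//   In dr_prediction_z1 the row-r sample position is x = (r + 1) * 256, so
//   base = x >> 8 = r + 1: the 'above' sample moves one full pixel to the
//   right per predicted row, as expected for a 45-degree direction.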
#if CONFIG_AOM_HIGHBITDEPTH
+static int highbd_intra_subpel_interp(int base, int shift, const uint16_t *ref,
+ int ref_start_idx, int ref_end_idx,
+ INTRA_FILTER filter_type) {
+ int val, k, idx, filter_idx = 0;
+ const int16_t *filter = NULL;
+
+ if (filter_type == INTRA_FILTER_LINEAR) {
+ val = ref[base] * (256 - shift) + ref[base + 1] * shift;
+ val = ROUND_POWER_OF_TWO(val, 8);
+ } else {
+ filter_idx = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
+ filter = av1_intra_filter_kernels[filter_type][filter_idx];
+
+ if (filter_idx < (1 << SUBPEL_BITS)) {
+ val = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k) {
+ idx = base + 1 - (SUBPEL_TAPS / 2) + k;
+ idx = AOMMAX(AOMMIN(idx, ref_end_idx), ref_start_idx);
+ val += ref[idx] * filter[k];
+ }
+ val = ROUND_POWER_OF_TWO(val, FILTER_BITS);
+ } else {
+ val = ref[base + 1];
+ }
+ }
+
+ return val;
+}
+
// Directional prediction, zone 1: 0 < angle < 90
-static void dr_prediction_z1_high(uint16_t *dst, ptrdiff_t stride, int bs,
- const uint16_t *const above,
- const uint16_t *const left, int dx, int dy,
- int bd) {
- int r, c, x, base, shift, val;
+static void highbd_dr_prediction_z1(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above, const uint16_t *left,
+ int dx, int dy, int bd,
+ INTRA_FILTER filter_type) {
+ int r, c, x, y, base, shift, val;
(void)left;
(void)dy;
assert(dy == 1);
assert(dx < 0);
- x = -dx;
- for (r = 0; r < bs; ++r, dst += stride, x -= dx) {
- base = x >> 8;
- shift = x & 0xFF;
-
- if (base >= 2 * bs - 1) {
- int i;
- for (i = r; i < bs; ++i) {
- memset(dst, above[2 * bs - 1], bs * sizeof(dst[0]));
- dst += stride;
- }
- return;
- }
-
- for (c = 0; c < bs; ++c, ++base) {
+ for (r = 0; r < bs; ++r) {
+ y = r + 1;
+ for (c = 0; c < bs; ++c) {
+ x = (c << 8) - y * dx;
+ base = x >> 8;
+ shift = x & 0xFF;
if (base < 2 * bs - 1) {
- val = above[base] * (256 - shift) + above[base + 1] * shift;
- val = ROUND_POWER_OF_TWO(val, 8);
+ val = highbd_intra_subpel_interp(base, shift, above, 0, 2 * bs - 1,
+ filter_type);
dst[c] = clip_pixel_highbd(val, bd);
} else {
dst[c] = above[2 * bs - 1];
}
}
+ dst += stride;
}
}
// Directional prediction, zone 2: 90 < angle < 180
-static void dr_prediction_z2_high(uint16_t *dst, ptrdiff_t stride, int bs,
- const uint16_t *const above,
- const uint16_t *const left, int dx, int dy,
- int bd) {
- int r, c, x, y, shift1, shift2, val, base1, base2;
+static void highbd_dr_prediction_z2(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above, const uint16_t *left,
+ int dx, int dy, int bd,
+ INTRA_FILTER filter_type) {
+ int r, c, x, y, shift, val, base;
assert(dx > 0);
assert(dy > 0);
- x = -dx;
- for (r = 0; r < bs; ++r, x -= dx, dst += stride) {
- base1 = x >> 8;
- y = (r << 8) - dy;
- for (c = 0; c < bs; ++c, ++base1, y -= dy) {
- if (base1 >= -1) {
- shift1 = x & 0xFF;
- val = above[base1] * (256 - shift1) + above[base1 + 1] * shift1;
- val = ROUND_POWER_OF_TWO(val, 8);
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ y = r + 1;
+ x = (c << 8) - y * dx;
+ base = x >> 8;
+ if (base >= -1) {
+ shift = x & 0xFF;
+ val = highbd_intra_subpel_interp(base, shift, above, -1, bs - 1,
+ filter_type);
} else {
- base2 = y >> 8;
- if (base2 >= 0) {
- shift2 = y & 0xFF;
- val = left[base2] * (256 - shift2) + left[base2 + 1] * shift2;
- val = ROUND_POWER_OF_TWO(val, 8);
+ x = c + 1;
+ y = (r << 8) - x * dy;
+ base = y >> 8;
+ if (base >= 0) {
+ shift = y & 0xFF;
+ val = highbd_intra_subpel_interp(base, shift, left, 0, bs - 1,
+ filter_type);
} else {
val = left[0];
}
}
dst[c] = clip_pixel_highbd(val, bd);
}
+ dst += stride;
}
}
// Directional prediction, zone 3: 180 < angle < 270
-static void dr_prediction_z3_high(uint16_t *dst, ptrdiff_t stride, int bs,
- const uint16_t *const above,
- const uint16_t *const left, int dx, int dy,
- int bd) {
- int r, c, y, base, shift, val;
+static void highbd_dr_prediction_z3(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above, const uint16_t *left,
+ int dx, int dy, int bd,
+ INTRA_FILTER filter_type) {
+ int r, c, x, y, base, shift, val;
(void)above;
(void)dx;
assert(dx == 1);
assert(dy < 0);
- y = -dy;
- for (c = 0; c < bs; ++c, y -= dy) {
- base = y >> 8;
- shift = y & 0xFF;
-
- for (r = 0; r < bs; ++r, ++base) {
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ x = c + 1;
+ y = (r << 8) - x * dy;
+ base = y >> 8;
+ shift = y & 0xFF;
if (base < 2 * bs - 1) {
- val = left[base] * (256 - shift) + left[base + 1] * shift;
- val = ROUND_POWER_OF_TWO(val, 8);
- dst[r * stride + c] = clip_pixel_highbd(val, bd);
+ val = highbd_intra_subpel_interp(base, shift, left, 0, 2 * bs - 1,
+ filter_type);
+ dst[c] = clip_pixel_highbd(val, bd);
} else {
- for (; r < bs; ++r) dst[r * stride + c] = left[2 * bs - 1];
- break;
+ dst[c] = left[2 * bs - 1];
}
}
+ dst += stride;
}
}
-static void dr_predictor_high(uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
- const uint16_t *above, const uint16_t *left,
- int angle, int bd) {
+static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r;
+ (void)left;
+ (void)bd;
+ for (r = 0; r < bs; r++) {
+ memcpy(dst, above, bs * sizeof(uint16_t));
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r;
+ (void)above;
+ (void)bd;
+ for (r = 0; r < bs; r++) {
+ aom_memset16(dst, left[r], bs);
+ dst += stride;
+ }
+}
+
+static void highbd_dr_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above, const uint16_t *left,
+ int angle, int bd, INTRA_FILTER filter) {
const int dx = get_dx(angle);
const int dy = get_dy(angle);
- const int bs = tx_size_1d[tx_size];
-
assert(angle > 0 && angle < 270);
- switch (angle) {
- case 90: pred_high[V_PRED][tx_size](dst, stride, above, left, bd); return;
- case 180: pred_high[H_PRED][tx_size](dst, stride, above, left, bd); return;
- case 45: pred_high[D45_PRED][tx_size](dst, stride, above, left, bd); return;
- case 135:
- pred_high[D135_PRED][tx_size](dst, stride, above, left, bd);
- return;
- case 117:
- pred_high[D117_PRED][tx_size](dst, stride, above, left, bd);
- return;
- case 153:
- pred_high[D153_PRED][tx_size](dst, stride, above, left, bd);
- return;
- case 207:
- pred_high[D207_PRED][tx_size](dst, stride, above, left, bd);
- return;
- case 63: pred_high[D63_PRED][tx_size](dst, stride, above, left, bd); return;
- default: break;
- }
if (angle > 0 && angle < 90) {
- dr_prediction_z1_high(dst, stride, bs, above, left, dx, dy, bd);
+ highbd_dr_prediction_z1(dst, stride, bs, above, left, dx, dy, bd, filter);
} else if (angle > 90 && angle < 180) {
- dr_prediction_z2_high(dst, stride, bs, above, left, dx, dy, bd);
+ highbd_dr_prediction_z2(dst, stride, bs, above, left, dx, dy, bd, filter);
} else if (angle > 180 && angle < 270) {
- dr_prediction_z3_high(dst, stride, bs, above, left, dx, dy, bd);
- } else {
- assert(0);
+ highbd_dr_prediction_z3(dst, stride, bs, above, left, dx, dy, bd, filter);
+ } else if (angle == 90) {
+ highbd_v_predictor(dst, stride, bs, above, left, bd);
+ } else if (angle == 180) {
+ highbd_h_predictor(dst, stride, bs, above, left, bd);
}
}
#endif // CONFIG_AOM_HIGHBITDEPTH
#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+int av1_filter_intra_taps_4[TX_SIZES][INTRA_MODES][4] = {
+ {
+ { 735, 881, -537, -54 },
+ { 1005, 519, -488, -11 },
+ { 383, 990, -343, -6 },
+ { 442, 805, -542, 319 },
+ { 658, 616, -133, -116 },
+ { 875, 442, -141, -151 },
+ { 386, 741, -23, -80 },
+ { 390, 1027, -446, 51 },
+ { 679, 606, -523, 262 },
+ { 903, 922, -778, -23 },
+ },
+ {
+ { 648, 803, -444, 16 },
+ { 972, 620, -576, 7 },
+ { 561, 967, -499, -5 },
+ { 585, 762, -468, 144 },
+ { 596, 619, -182, -9 },
+ { 895, 459, -176, -153 },
+ { 557, 722, -126, -129 },
+ { 601, 839, -523, 105 },
+ { 562, 709, -499, 251 },
+ { 803, 872, -695, 43 },
+ },
+ {
+ { 423, 728, -347, 111 },
+ { 963, 685, -665, 23 },
+ { 281, 1024, -480, 216 },
+ { 640, 596, -437, 78 },
+ { 429, 669, -259, 99 },
+ { 740, 646, -415, 23 },
+ { 568, 771, -346, 40 },
+ { 404, 833, -486, 209 },
+ { 398, 712, -423, 307 },
+ { 939, 935, -887, 17 },
+ },
+ {
+ { 477, 737, -393, 150 },
+ { 881, 630, -546, 67 },
+ { 506, 984, -443, -20 },
+ { 114, 459, -270, 528 },
+ { 433, 528, 14, 3 },
+ { 837, 470, -301, -30 },
+ { 181, 777, 89, -107 },
+ { -29, 716, -232, 259 },
+ { 589, 646, -495, 255 },
+ { 740, 884, -728, 77 },
+ },
+};
+
+static void filter_intra_predictors_4tap(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left, int mode) {
+ int k, r, c;
+ int buffer[33][65];
+ int mean, ipred;
+ const TX_SIZE tx_size =
+ (bs == 32) ? TX_32X32
+ : ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4)));
+ const int c0 = av1_filter_intra_taps_4[tx_size][mode][0];
+ const int c1 = av1_filter_intra_taps_4[tx_size][mode][1];
+ const int c2 = av1_filter_intra_taps_4[tx_size][mode][2];
+ const int c3 = av1_filter_intra_taps_4[tx_size][mode][3];
+
+ k = 0;
+ mean = 0;
+ while (k < bs) {
+ mean = mean + (int)left[k];
+ mean = mean + (int)above[k];
+ k++;
+ }
+ mean = (mean + bs) / (2 * bs);
+
+ for (r = 0; r < bs; ++r) buffer[r + 1][0] = (int)left[r] - mean;
+
+ for (c = 0; c < 2 * bs + 1; ++c) buffer[0][c] = (int)above[c - 1] - mean;
+
+ for (r = 1; r < bs + 1; ++r)
+ for (c = 1; c < 2 * bs + 1 - r; ++c) {
+ ipred = c0 * buffer[r - 1][c] + c1 * buffer[r][c - 1] +
+ c2 * buffer[r - 1][c - 1] + c3 * buffer[r - 1][c + 1];
+ buffer[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS);
+ }
+
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ ipred = buffer[r + 1][c + 1] + mean;
+ dst[c] = clip_pixel(ipred);
+ }
+ dst += stride;
+ }
+}
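Each av1_filter_intra_taps_4 entry is a 4-tap kernel applied to the {top, left, top-left, top-right} neighbours of the mean-subtracted buffer; the taps sum to roughly 1024 (presumably 1 << FILTER_INTRA_PREC_BITS), so the recursion is approximately gain-1 around the subtracted mean. One worked step with the TX_4X4 / DC_PRED taps {735, 881, -537, -54} (illustrative):

//   ipred = 735 * buffer[r - 1][c] + 881 * buffer[r][c - 1]
//           - 537 * buffer[r - 1][c - 1] - 54 * buffer[r - 1][c + 1];
//   buffer[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS);
//   (735 + 881 - 537 - 54 = 1025, close to 1024.)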
+
+void av1_dc_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, DC_PRED);
+}
+
+void av1_v_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, V_PRED);
+}
+
+void av1_h_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, H_PRED);
+}
+
+void av1_d45_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, D45_PRED);
+}
+
+void av1_d135_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, D135_PRED);
+}
+
+void av1_d117_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, D117_PRED);
+}
+
+void av1_d153_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, D153_PRED);
+}
+
+void av1_d207_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, D207_PRED);
+}
+
+void av1_d63_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, D63_PRED);
+}
+
+void av1_tm_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ filter_intra_predictors_4tap(dst, stride, bs, above, left, TM_PRED);
+}
+
+static void filter_intra_predictors(int mode, uint8_t *dst, ptrdiff_t stride,
+ int bs, const uint8_t *above,
+ const uint8_t *left) {
+ switch (mode) {
+ case DC_PRED: av1_dc_filter_predictor(dst, stride, bs, above, left); break;
+ case V_PRED: av1_v_filter_predictor(dst, stride, bs, above, left); break;
+ case H_PRED: av1_h_filter_predictor(dst, stride, bs, above, left); break;
+ case D45_PRED:
+ av1_d45_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case D135_PRED:
+ av1_d135_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case D117_PRED:
+ av1_d117_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case D153_PRED:
+ av1_d153_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case D207_PRED:
+ av1_d207_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case D63_PRED:
+ av1_d63_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case TM_PRED: av1_tm_filter_predictor(dst, stride, bs, above, left); break;
+ default: assert(0);
+ }
+}
#if CONFIG_AOM_HIGHBITDEPTH
-static void build_intra_predictors_high(const MACROBLOCKD *xd,
- const uint8_t *ref8, int ref_stride,
- uint8_t *dst8, int dst_stride,
- PREDICTION_MODE mode, TX_SIZE tx_size,
- int n_top_px, int n_topright_px,
- int n_left_px, int n_bottomleft_px,
- int x, int y, int plane, int bd) {
+static void highbd_filter_intra_predictors_4tap(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int mode,
+ int bd) {
+ int k, r, c;
+ int preds[33][65];
+ int mean, ipred;
+ const TX_SIZE tx_size =
+ (bs == 32) ? TX_32X32
+ : ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4)));
+ const int c0 = av1_filter_intra_taps_4[tx_size][mode][0];
+ const int c1 = av1_filter_intra_taps_4[tx_size][mode][1];
+ const int c2 = av1_filter_intra_taps_4[tx_size][mode][2];
+ const int c3 = av1_filter_intra_taps_4[tx_size][mode][3];
+
+ k = 0;
+ mean = 0;
+ while (k < bs) {
+ mean = mean + (int)left[k];
+ mean = mean + (int)above[k];
+ k++;
+ }
+ mean = (mean + bs) / (2 * bs);
+
+ for (r = 0; r < bs; ++r) preds[r + 1][0] = (int)left[r] - mean;
+
+ for (c = 0; c < 2 * bs + 1; ++c) preds[0][c] = (int)above[c - 1] - mean;
+
+ for (r = 1; r < bs + 1; ++r)
+ for (c = 1; c < 2 * bs + 1 - r; ++c) {
+ ipred = c0 * preds[r - 1][c] + c1 * preds[r][c - 1] +
+ c2 * preds[r - 1][c - 1] + c3 * preds[r - 1][c + 1];
+ preds[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS);
+ }
+
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ ipred = preds[r + 1][c + 1] + mean;
+ dst[c] = clip_pixel_highbd(ipred, bd);
+ }
+ dst += stride;
+ }
+}
+
+void av1_highbd_dc_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, DC_PRED,
+ bd);
+}
+
+void av1_highbd_v_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, V_PRED, bd);
+}
+
+void av1_highbd_h_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, H_PRED, bd);
+}
+
+void av1_highbd_d45_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D45_PRED,
+ bd);
+}
+
+void av1_highbd_d135_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D135_PRED,
+ bd);
+}
+
+void av1_highbd_d117_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D117_PRED,
+ bd);
+}
+
+void av1_highbd_d153_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D153_PRED,
+ bd);
+}
+
+void av1_highbd_d207_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D207_PRED,
+ bd);
+}
+
+void av1_highbd_d63_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D63_PRED,
+ bd);
+}
+
+void av1_highbd_tm_filter_predictor_c(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, TM_PRED,
+ bd);
+}
+
+static void highbd_filter_intra_predictors(int mode, uint16_t *dst,
+ ptrdiff_t stride, int bs,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ switch (mode) {
+ case DC_PRED:
+ av1_highbd_dc_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case V_PRED:
+ av1_highbd_v_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case H_PRED:
+ av1_highbd_h_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case D45_PRED:
+ av1_highbd_d45_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case D135_PRED:
+ av1_highbd_d135_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case D117_PRED:
+ av1_highbd_d117_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case D153_PRED:
+ av1_highbd_d153_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case D207_PRED:
+ av1_highbd_d207_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case D63_PRED:
+ av1_highbd_d63_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ case TM_PRED:
+ av1_highbd_tm_filter_predictor(dst, stride, bs, above, left, bd);
+ break;
+ default: assert(0);
+ }
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_AOM_HIGHBITDEPTH
+static void build_intra_predictors_high(
+ const MACROBLOCKD *xd, const uint8_t *ref8, int ref_stride, uint8_t *dst8,
+ int dst_stride, PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px,
+ int n_topright_px, int n_left_px, int n_bottomleft_px, int plane) {
int i;
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- DECLARE_ALIGNED(16, uint16_t, left_col[32]);
- DECLARE_ALIGNED(16, uint16_t, above_data[64 + 16]);
+ DECLARE_ALIGNED(16, uint16_t, left_col[MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint16_t, above_data[MAX_SB_SIZE + 16]);
uint16_t *above_row = above_data + 16;
const uint16_t *const_above_row = above_row;
- const int bs = tx_size_1d[tx_size];
- const uint16_t *above_ref = ref - ref_stride;
- const int base = 128 << (bd - 8);
-#if CONFIG_EXT_INTRA
+ const int bs = tx_size_wide[tx_size];
int need_left = extend_modes[mode] & NEED_LEFT;
int need_above = extend_modes[mode] & NEED_ABOVE;
- const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
+ const uint16_t *above_ref = ref - ref_stride;
+#if CONFIG_EXT_INTRA
int p_angle = 0;
- const TX_SIZE max_tx_size = max_txsize_lookup[mbmi->sb_type];
- const int angle_step =
- plane ? ANGLE_STEP_UV : av1_angle_step_y[max_tx_size][mbmi->mode];
- const int use_directional_mode =
- is_directional_mode(mode) && mbmi->sb_type >= BLOCK_8X8;
-#else
- const int need_left = extend_modes[mode] & NEED_LEFT;
- const int need_above = extend_modes[mode] & NEED_ABOVE;
+ const int is_dr_mode = mode != DC_PRED && mode != TM_PRED &&
+ xd->mi[0]->mbmi.sb_type >= BLOCK_8X8;
#endif // CONFIG_EXT_INTRA
- // 127 127 127 .. 127 127 127 127 127 127
- // 129 A B .. Y Z
- // 129 C D .. W X
- // 129 E F .. U V
- // 129 G H .. S T T T T T
-
- (void)x;
- (void)y;
- (void)plane;
+#if CONFIG_FILTER_INTRA
+ const FILTER_INTRA_MODE_INFO *filter_intra_mode_info =
+ &xd->mi[0]->mbmi.filter_intra_mode_info;
+ const FILTER_INTRA_MODE filter_intra_mode =
+ filter_intra_mode_info->filter_intra_mode[plane != 0];
+#endif // CONFIG_FILTER_INTRA
+ int base = 128 << (xd->bd - 8);
+// 127 127 127 .. 127 127 127 127 127 127
+// 129 A B .. Y Z
+// 129 C D .. W X
+// 129 E F .. U V
+// 129 G H .. S T T T T T
#if CONFIG_EXT_INTRA
- if (use_directional_mode) {
+ if (is_dr_mode) {
p_angle = mode_to_angle_map[mode] +
- mbmi->intra_angle_delta[plane != 0] * angle_step;
+ xd->mi[0]->mbmi.angle_delta[plane != 0] * ANGLE_STEP;
if (p_angle <= 90)
- need_above = 1, need_left = 0;
+ need_above = 1, need_left = 0, need_above_left = 1;
else if (p_angle < 180)
- need_above = 1, need_left = 1;
+ need_above = 1, need_left = 1, need_above_left = 1;
else
- need_above = 0, need_left = 1;
+ need_above = 0, need_left = 1, need_above_left = 1;
}
#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
+ need_left = need_above = need_above_left = 1;
+#endif // CONFIG_FILTER_INTRA
+
+ (void)plane;
+ assert(n_top_px >= 0);
+ assert(n_topright_px >= 0);
+ assert(n_left_px >= 0);
+ assert(n_bottomleft_px >= 0);
+
+ if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
+ const int val = (n_left_px == 0) ? base + 1 : base - 1;
+ for (i = 0; i < bs; ++i) {
+ aom_memset16(dst, val, bs);
+ dst += dst_stride;
+ }
+ return;
+ }
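The 127/129 fill values in the diagram above generalise to base - 1 and base + 1 at higher bit depths. A worked instance (illustrative) for a 10-bit stream:

//   base = 128 << (10 - 8) = 512
//   unavailable above row   -> filled with base - 1 = 511
//   unavailable left column -> filled with base + 1 = 513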
// NEED_LEFT
if (need_left) {
+#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
+ int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+#if CONFIG_FILTER_INTRA
+ if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
+ need_bottom = 0;
+#endif // CONFIG_FILTER_INTRA
#if CONFIG_EXT_INTRA
- const int need_bottom = use_directional_mode
- ? (p_angle > 180)
- : (!!(extend_modes[mode] & NEED_BOTTOMLEFT));
+ if (is_dr_mode) need_bottom = p_angle > 180;
+#endif // CONFIG_EXT_INTRA
#else
const int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
-#endif // CONFIG_EXT_INTRA
+#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
i = 0;
if (n_left_px > 0) {
for (; i < n_left_px; i++) left_col[i] = ref[i * ref_stride - 1];
@@ -607,13 +1286,18 @@
// NEED_ABOVE
if (need_above) {
+#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
+ int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+#if CONFIG_FILTER_INTRA
+ if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
+ need_right = 1;
+#endif // CONFIG_FILTER_INTRA
#if CONFIG_EXT_INTRA
- const int need_right = use_directional_mode
- ? (p_angle < 90)
- : (!!(extend_modes[mode] & NEED_ABOVERIGHT));
+ if (is_dr_mode) need_right = p_angle < 90;
+#endif // CONFIG_EXT_INTRA
#else
const int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
-#endif // CONFIG_EXT_INTRA
+#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
if (n_top_px > 0) {
memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0]));
i = n_top_px;
@@ -630,20 +1314,26 @@
}
}
-#if CONFIG_EXT_INTRA
- above_row[-1] =
- n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : base + 1) : base - 1;
-#else
- if (extend_modes[mode] & NEED_ABOVELEFT) {
+ if (need_above_left) {
above_row[-1] =
n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : base + 1) : base - 1;
}
-#endif // CONFIG_EXT_INTRA
+
+#if CONFIG_FILTER_INTRA
+ if (filter_intra_mode_info->use_filter_intra_mode[plane != 0]) {
+ highbd_filter_intra_predictors(filter_intra_mode, dst, dst_stride, bs,
+ const_above_row, left_col, xd->bd);
+ return;
+ }
+#endif // CONFIG_FILTER_INTRA
#if CONFIG_EXT_INTRA
- if (use_directional_mode) {
- dr_predictor_high(dst, dst_stride, tx_size, const_above_row, left_col,
- p_angle, bd);
+ if (is_dr_mode) {
+ INTRA_FILTER filter = INTRA_FILTER_LINEAR;
+ if (plane == 0 && av1_is_intra_filter_switchable(p_angle))
+ filter = xd->mi[0]->mbmi.intra_filter;
+ highbd_dr_predictor(dst, dst_stride, bs, const_above_row, left_col, p_angle,
+ xd->bd, filter);
return;
}
#endif // CONFIG_EXT_INTRA
@@ -663,65 +1353,84 @@
int ref_stride, uint8_t *dst, int dst_stride,
PREDICTION_MODE mode, TX_SIZE tx_size,
int n_top_px, int n_topright_px,
- int n_left_px, int n_bottomleft_px, int x,
- int y, int plane) {
+ int n_left_px, int n_bottomleft_px,
+ int plane) {
int i;
- DECLARE_ALIGNED(16, uint8_t, left_col[64]);
+ DECLARE_ALIGNED(16, uint8_t, left_col[MAX_SB_SIZE]);
const uint8_t *above_ref = ref - ref_stride;
- DECLARE_ALIGNED(16, uint8_t, above_data[64 + 16]);
+ DECLARE_ALIGNED(16, uint8_t, above_data[MAX_SB_SIZE + 16]);
uint8_t *above_row = above_data + 16;
const uint8_t *const_above_row = above_row;
- const int bs = tx_size_1d[tx_size];
+ const int bs = tx_size_wide[tx_size];
int need_left = extend_modes[mode] & NEED_LEFT;
int need_above = extend_modes[mode] & NEED_ABOVE;
+ int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
#if CONFIG_EXT_INTRA
- const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
int p_angle = 0;
- const TX_SIZE max_tx_size = max_txsize_lookup[mbmi->sb_type];
- const int angle_step =
- plane ? ANGLE_STEP_UV : av1_angle_step_y[max_tx_size][mbmi->mode];
- const int use_directional_mode =
- is_directional_mode(mode) && mbmi->sb_type >= BLOCK_8X8;
+ const int is_dr_mode = mode != DC_PRED && mode != TM_PRED &&
+ xd->mi[0]->mbmi.sb_type >= BLOCK_8X8;
#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ const FILTER_INTRA_MODE_INFO *filter_intra_mode_info =
+ &xd->mi[0]->mbmi.filter_intra_mode_info;
+ const FILTER_INTRA_MODE filter_intra_mode =
+ filter_intra_mode_info->filter_intra_mode[plane != 0];
+#endif // CONFIG_FILTER_INTRA
- // 127 127 127 .. 127 127 127 127 127 127
- // 129 A B .. Y Z
- // 129 C D .. W X
- // 129 E F .. U V
- // 129 G H .. S T T T T T
- // ..
+// 127 127 127 .. 127 127 127 127 127 127
+// 129 A B .. Y Z
+// 129 C D .. W X
+// 129 E F .. U V
+// 129 G H .. S T T T T T
+// ..
+
+#if CONFIG_EXT_INTRA
+ if (is_dr_mode) {
+ p_angle = mode_to_angle_map[mode] +
+ xd->mi[0]->mbmi.angle_delta[plane != 0] * ANGLE_STEP;
+ if (p_angle <= 90)
+ need_above = 1, need_left = 0, need_above_left = 1;
+ else if (p_angle < 180)
+ need_above = 1, need_left = 1, need_above_left = 1;
+ else
+ need_above = 0, need_left = 1, need_above_left = 1;
+ }
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
+ need_left = need_above = need_above_left = 1;
+#endif // CONFIG_FILTER_INTRA
(void)xd;
- (void)x;
- (void)y;
(void)plane;
assert(n_top_px >= 0);
assert(n_topright_px >= 0);
assert(n_left_px >= 0);
assert(n_bottomleft_px >= 0);
-#if CONFIG_EXT_INTRA
- if (use_directional_mode) {
- p_angle = mode_to_angle_map[mode] +
- mbmi->intra_angle_delta[plane != 0] * angle_step;
- if (p_angle <= 90)
- need_above = 1, need_left = 0;
- else if (p_angle < 180)
- need_above = 1, need_left = 1;
- else
- need_above = 0, need_left = 1;
+ if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
+ const int val = (n_left_px == 0) ? 129 : 127;
+ for (i = 0; i < bs; ++i) {
+ memset(dst, val, bs);
+ dst += dst_stride;
+ }
+ return;
}
-#endif // CONFIG_EXT_INTRA
// NEED_LEFT
if (need_left) {
+#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
+ int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+#if CONFIG_FILTER_INTRA
+ if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
+ need_bottom = 0;
+#endif // CONFIG_FILTER_INTRA
#if CONFIG_EXT_INTRA
- const int need_bottom = use_directional_mode
- ? (p_angle > 180)
- : (!!(extend_modes[mode] & NEED_BOTTOMLEFT));
+ if (is_dr_mode) need_bottom = p_angle > 180;
+#endif // CONFIG_EXT_INTRA
#else
const int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
-#endif // CONFIG_EXT_INTRA
+#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
i = 0;
if (n_left_px > 0) {
for (; i < n_left_px; i++) left_col[i] = ref[i * ref_stride - 1];
@@ -739,13 +1448,18 @@
// NEED_ABOVE
if (need_above) {
+#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
+ int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+#if CONFIG_FILTER_INTRA
+ if (filter_intra_mode_info->use_filter_intra_mode[plane != 0])
+ need_right = 1;
+#endif // CONFIG_FILTER_INTRA
#if CONFIG_EXT_INTRA
- const int need_right = use_directional_mode
- ? (p_angle < 90)
- : (!!(extend_modes[mode] & NEED_ABOVERIGHT));
+ if (is_dr_mode) need_right = p_angle < 90;
+#endif // CONFIG_EXT_INTRA
#else
const int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
-#endif // CONFIG_EXT_INTRA
+#endif  // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA
if (n_top_px > 0) {
memcpy(above_row, above_ref, n_top_px);
i = n_top_px;
@@ -761,17 +1475,25 @@
}
}
-#if CONFIG_EXT_INTRA
- above_row[-1] = n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : 129) : 127;
-#else
- if (extend_modes[mode] & NEED_ABOVELEFT) {
+ if (need_above_left) {
above_row[-1] = n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : 129) : 127;
}
-#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ if (filter_intra_mode_info->use_filter_intra_mode[plane != 0]) {
+ filter_intra_predictors(filter_intra_mode, dst, dst_stride, bs,
+ const_above_row, left_col);
+ return;
+ }
+#endif // CONFIG_FILTER_INTRA
#if CONFIG_EXT_INTRA
- if (use_directional_mode) {
- dr_predictor(dst, dst_stride, tx_size, const_above_row, left_col, p_angle);
+  if (is_dr_mode) {
+ INTRA_FILTER filter = INTRA_FILTER_LINEAR;
+ if (plane == 0 && av1_is_intra_filter_switchable(p_angle))
+ filter = xd->mi[0]->mbmi.intra_filter;
+ dr_predictor(dst, dst_stride, tx_size, const_above_row, left_col, p_angle,
+ filter);
return;
}
#endif // CONFIG_EXT_INTRA
@@ -785,38 +1507,50 @@
}
}
-void av1_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, int bhl_in,
+void av1_predict_intra_block(const MACROBLOCKD *xd, int wpx, int hpx,
TX_SIZE tx_size, PREDICTION_MODE mode,
const uint8_t *ref, int ref_stride, uint8_t *dst,
- int dst_stride, int aoff, int loff, int plane) {
- const int have_top = loff || xd->up_available;
- const int have_left = aoff || xd->left_available;
- const int x = aoff * 4;
- const int y = loff * 4;
- const int bw = AOMMAX(2, 1 << bwl_in);
- const int bh = AOMMAX(2, 1 << bhl_in);
- const int mi_row = -xd->mb_to_top_edge >> 6;
- const int mi_col = -xd->mb_to_left_edge >> 6;
+ int dst_stride, int col_off, int row_off,
+ int plane) {
const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int txw = tx_size_wide_unit[tx_size];
+ const int txh = tx_size_high_unit[tx_size];
+ const int have_top = row_off || xd->up_available;
+ const int have_left = col_off || xd->left_available;
+ const int x = col_off * 4;
+ const int y = row_off * 4;
+ const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+ const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+ const int txwpx = 4 * txw;
+ const int txhpx = 4 * txh;
+ // Distance between the right edge of this prediction block to
+ // the frame right edge
+ const int xr =
+ (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + (wpx - x - txwpx);
+ // Distance between the bottom edge of this prediction block to
+ // the frame bottom edge
+ const int yd =
+ (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + (hpx - y - txhpx);
const int right_available =
- mi_col + (bw >> !pd->subsampling_x) < xd->tile.mi_col_end;
- const int have_right = av1_has_right(bsize, mi_row, mi_col, right_available,
- tx_size, loff, aoff, pd->subsampling_x);
- const int have_bottom =
- av1_has_bottom(bsize, mi_row, mi_col, xd->mb_to_bottom_edge > 0, tx_size,
- loff, aoff, pd->subsampling_y);
- const int wpx = 4 * bw;
- const int hpx = 4 * bh;
- const int txpx = tx_size_1d[tx_size];
- int xr = (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + (wpx - x - txpx);
- int yd =
- (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + (hpx - y - txpx);
+ (mi_col + ((col_off + txw) >> (1 - pd->subsampling_x))) <
+ xd->tile.mi_col_end;
+#if CONFIG_EXT_PARTITION_TYPES
+ const PARTITION_TYPE partition = xd->mi[0]->mbmi.partition;
+#endif
+ const int have_right =
+ av1_has_right(bsize, mi_row, mi_col, right_available,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ tx_size, row_off, col_off, pd->subsampling_x);
+ const int have_bottom = av1_has_bottom(bsize, mi_row, mi_col, yd > 0, tx_size,
+ row_off, col_off, pd->subsampling_y);
#if CONFIG_PALETTE
if (xd->mi[0]->mbmi.palette_mode_info.palette_size[plane != 0] > 0) {
- const int bs = tx_size_1d[tx_size];
- const int stride = 4 * (1 << bwl_in);
+ const int bs = tx_size_wide[tx_size];
+ const int stride = wpx;
int r, c;
uint8_t *map = NULL;
#if CONFIG_AOM_HIGHBITDEPTH
@@ -836,34 +1570,36 @@
for (c = 0; c < bs; ++c)
dst16[r * dst_stride + c] = palette[map[(r + y) * stride + c + x]];
} else {
-#endif // CONFIG_AOM_HIGHBITDEPTH
for (r = 0; r < bs; ++r)
for (c = 0; c < bs; ++c)
- dst[r * dst_stride + c] = palette[map[(r + y) * stride + c + x]];
-#if CONFIG_AOM_HIGHBITDEPTH
+ dst[r * dst_stride + c] =
+ (uint8_t)(palette[map[(r + y) * stride + c + x]]);
}
+#else
+ for (r = 0; r < bs; ++r)
+ for (c = 0; c < bs; ++c)
+ dst[r * dst_stride + c] = palette[map[(r + y) * stride + c + x]];
#endif // CONFIG_AOM_HIGHBITDEPTH
-
return;
}
#endif // CONFIG_PALETTE
#if CONFIG_AOM_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode,
- tx_size, have_top ? AOMMIN(txpx, xr + txpx) : 0,
- have_top && have_right ? AOMMIN(txpx, xr) : 0,
- have_left ? AOMMIN(txpx, yd + txpx) : 0,
- have_bottom && have_left ? AOMMIN(txpx, yd) : 0,
- x, y, plane, xd->bd);
+ build_intra_predictors_high(
+ xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
+ have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
+ have_top && have_right ? AOMMIN(txwpx, xr) : 0,
+ have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
+ have_bottom && have_left ? AOMMIN(txhpx, yd) : 0, plane);
return;
}
#endif
build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
- have_top ? AOMMIN(txpx, xr + txpx) : 0,
- have_top && have_right ? AOMMIN(txpx, xr) : 0,
- have_left ? AOMMIN(txpx, yd + txpx) : 0,
- have_bottom && have_left ? AOMMIN(txpx, yd) : 0, x, y,
+ have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
+ have_top && have_right ? AOMMIN(txwpx, xr) : 0,
+ have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
+ have_bottom && have_left ? AOMMIN(txhpx, yd) : 0,
plane);
}
diff --git a/av1/common/reconintra.h b/av1/common/reconintra.h
index f75dbcd..23bad1c 100644
--- a/av1/common/reconintra.h
+++ b/av1/common/reconintra.h
@@ -21,12 +21,31 @@
void av1_init_intra_predictors(void);
-void av1_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, int bhl_in,
+void av1_predict_intra_block(const MACROBLOCKD *xd, int bw, int bh,
TX_SIZE tx_size, PREDICTION_MODE mode,
const uint8_t *ref, int ref_stride, uint8_t *dst,
int dst_stride, int aoff, int loff, int plane);
+
+#if CONFIG_EXT_INTER
+// Mapping of interintra to intra mode for use in the intra component
+static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = {
+ DC_PRED, V_PRED, H_PRED, D45_PRED, D135_PRED,
+ D117_PRED, D153_PRED, D207_PRED, D63_PRED, TM_PRED
+};
+
+// Mapping of intra mode to the interintra mode
+static const INTERINTRA_MODE intra_to_interintra_mode[INTRA_MODES] = {
+ II_DC_PRED, II_V_PRED, II_H_PRED, II_D45_PRED, II_D135_PRED,
+ II_D117_PRED, II_D153_PRED, II_D207_PRED, II_D63_PRED, II_TM_PRED
+};
+#endif // CONFIG_EXT_INTER
#ifdef __cplusplus
} // extern "C"
#endif
+#if CONFIG_FILTER_INTRA
+#define FILTER_INTRA_PREC_BITS 10
+extern int av1_filter_intra_taps_4[TX_SIZES][INTRA_MODES][4];
+#endif // CONFIG_FILTER_INTRA
+
#endif // AV1_COMMON_RECONINTRA_H_
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
new file mode 100644
index 0000000..f1c4239
--- /dev/null
+++ b/av1/common/restoration.c
@@ -0,0 +1,653 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ */
+
+#include <math.h>
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/restoration.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#define BILATERAL_PARAM_PRECISION 16
+#define BILATERAL_AMP_RANGE 256
+#define BILATERAL_AMP_RANGE_SYM (2 * BILATERAL_AMP_RANGE + 1)
+
+static uint8_t bilateral_filter_coeffs_r_kf[BILATERAL_LEVELS_KF]
+ [BILATERAL_AMP_RANGE_SYM];
+static uint8_t bilateral_filter_coeffs_r[BILATERAL_LEVELS]
+ [BILATERAL_AMP_RANGE_SYM];
+static uint8_t bilateral_filter_coeffs_s_kf[BILATERAL_LEVELS_KF]
+ [RESTORATION_WIN][RESTORATION_WIN];
+static uint8_t bilateral_filter_coeffs_s[BILATERAL_LEVELS][RESTORATION_WIN]
+ [RESTORATION_WIN];
+
+typedef struct bilateral_params {
+ int sigma_x; // spatial variance x
+ int sigma_y; // spatial variance y
+ int sigma_r; // range variance
+} BilateralParamsType;
+
+static BilateralParamsType bilateral_level_to_params_arr[BILATERAL_LEVELS] = {
+  // Values are rounded to 1/16th precision
+ { 8, 9, 30 }, { 9, 8, 30 }, { 9, 11, 32 }, { 11, 9, 32 },
+ { 14, 14, 36 }, { 18, 18, 36 }, { 24, 24, 40 }, { 32, 32, 40 },
+};
+
+static BilateralParamsType
+ bilateral_level_to_params_arr_kf[BILATERAL_LEVELS_KF] = {
+      // Values are rounded to 1/16th precision
+ { 8, 8, 30 }, { 9, 9, 32 }, { 10, 10, 32 }, { 12, 12, 32 },
+ { 14, 14, 32 }, { 18, 18, 36 }, { 24, 24, 40 }, { 30, 30, 44 },
+ { 36, 36, 48 }, { 42, 42, 48 }, { 48, 48, 48 }, { 48, 48, 56 },
+ { 56, 56, 48 }, { 56, 56, 56 }, { 56, 56, 64 }, { 64, 64, 48 },
+ };
+
+typedef void (*restore_func_type)(uint8_t *data8, int width, int height,
+ int stride, RestorationInternal *rst,
+ uint8_t *tmpdata8, int tmpstride);
+#if CONFIG_AOM_HIGHBITDEPTH
+typedef void (*restore_func_highbd_type)(uint8_t *data8, int width, int height,
+ int stride, RestorationInternal *rst,
+ uint8_t *tmpdata8, int tmpstride,
+ int bit_depth);
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+static INLINE BilateralParamsType av1_bilateral_level_to_params(int index,
+ int kf) {
+ return kf ? bilateral_level_to_params_arr_kf[index]
+ : bilateral_level_to_params_arr[index];
+}
+
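+// Precompute the lookup tables used by the bilateral filter, for both the
+// key-frame and inter-frame parameter sets: a range (intensity-difference)
+// weight table and a spatial weight table, each holding Gaussian weights
+// scaled to RESTORATION_FILT_STEP fixed-point precision.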
+void av1_loop_restoration_precal(void) {
+ int i;
+ for (i = 0; i < BILATERAL_LEVELS_KF; i++) {
+ const BilateralParamsType param = av1_bilateral_level_to_params(i, 1);
+ const int sigma_x = param.sigma_x;
+ const int sigma_y = param.sigma_y;
+ const int sigma_r = param.sigma_r;
+ const double sigma_r_d = (double)sigma_r / BILATERAL_PARAM_PRECISION;
+ const double sigma_x_d = (double)sigma_x / BILATERAL_PARAM_PRECISION;
+ const double sigma_y_d = (double)sigma_y / BILATERAL_PARAM_PRECISION;
+
+ uint8_t *fr = bilateral_filter_coeffs_r_kf[i] + BILATERAL_AMP_RANGE;
+ int j, x, y;
+ for (j = 0; j <= BILATERAL_AMP_RANGE; j++) {
+ fr[j] = (uint8_t)(0.5 +
+ RESTORATION_FILT_STEP *
+ exp(-(j * j) / (2 * sigma_r_d * sigma_r_d)));
+ fr[-j] = fr[j];
+ }
+ for (y = -RESTORATION_HALFWIN; y <= RESTORATION_HALFWIN; y++) {
+ for (x = -RESTORATION_HALFWIN; x <= RESTORATION_HALFWIN; x++) {
+ bilateral_filter_coeffs_s_kf[i][y + RESTORATION_HALFWIN]
+ [x + RESTORATION_HALFWIN] = (uint8_t)(
+ 0.5 +
+ RESTORATION_FILT_STEP *
+ exp(-(x * x) / (2 * sigma_x_d *
+ sigma_x_d) -
+ (y * y) / (2 * sigma_y_d *
+ sigma_y_d)));
+ }
+ }
+ }
+ for (i = 0; i < BILATERAL_LEVELS; i++) {
+ const BilateralParamsType param = av1_bilateral_level_to_params(i, 0);
+ const int sigma_x = param.sigma_x;
+ const int sigma_y = param.sigma_y;
+ const int sigma_r = param.sigma_r;
+ const double sigma_r_d = (double)sigma_r / BILATERAL_PARAM_PRECISION;
+ const double sigma_x_d = (double)sigma_x / BILATERAL_PARAM_PRECISION;
+ const double sigma_y_d = (double)sigma_y / BILATERAL_PARAM_PRECISION;
+
+ uint8_t *fr = bilateral_filter_coeffs_r[i] + BILATERAL_AMP_RANGE;
+ int j, x, y;
+ for (j = 0; j <= BILATERAL_AMP_RANGE; j++) {
+ fr[j] = (uint8_t)(0.5 +
+ RESTORATION_FILT_STEP *
+ exp(-(j * j) / (2 * sigma_r_d * sigma_r_d)));
+ fr[-j] = fr[j];
+ }
+ for (y = -RESTORATION_HALFWIN; y <= RESTORATION_HALFWIN; y++) {
+ for (x = -RESTORATION_HALFWIN; x <= RESTORATION_HALFWIN; x++) {
+ bilateral_filter_coeffs_s[i][y + RESTORATION_HALFWIN]
+ [x + RESTORATION_HALFWIN] = (uint8_t)(
+ 0.5 +
+ RESTORATION_FILT_STEP *
+ exp(-(x * x) /
+ (2 * sigma_x_d * sigma_x_d) -
+ (y * y) /
+ (2 * sigma_y_d * sigma_y_d)));
+ }
+ }
+ }
+}
+
+int av1_bilateral_level_bits(const AV1_COMMON *const cm) {
+ return cm->frame_type == KEY_FRAME ? BILATERAL_LEVEL_BITS_KF
+ : BILATERAL_LEVEL_BITS;
+}
+
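+// Set up the restoration state for the current frame. For Wiener-filtered
+// tiles the stored half-filter taps are mirrored into a full symmetric
+// filter, with the center tap adjusted so that all taps sum to
+// RESTORATION_FILT_STEP.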
+void av1_loop_restoration_init(RestorationInternal *rst, RestorationInfo *rsi,
+ int kf, int width, int height) {
+ int i, tile_idx;
+ rst->rsi = rsi;
+ rst->keyframe = kf;
+ rst->subsampling_x = 0;
+ rst->subsampling_y = 0;
+ rst->ntiles =
+ av1_get_rest_ntiles(width, height, &rst->tile_width, &rst->tile_height,
+ &rst->nhtiles, &rst->nvtiles);
+ if (rsi->frame_restoration_type == RESTORE_WIENER) {
+ for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
+ rsi->wiener_info[tile_idx].vfilter[RESTORATION_HALFWIN] =
+ rsi->wiener_info[tile_idx].hfilter[RESTORATION_HALFWIN] =
+ RESTORATION_FILT_STEP;
+ for (i = 0; i < RESTORATION_HALFWIN; ++i) {
+ rsi->wiener_info[tile_idx].vfilter[RESTORATION_WIN - 1 - i] =
+ rsi->wiener_info[tile_idx].vfilter[i];
+ rsi->wiener_info[tile_idx].hfilter[RESTORATION_WIN - 1 - i] =
+ rsi->wiener_info[tile_idx].hfilter[i];
+ rsi->wiener_info[tile_idx].vfilter[RESTORATION_HALFWIN] -=
+ 2 * rsi->wiener_info[tile_idx].vfilter[i];
+ rsi->wiener_info[tile_idx].hfilter[RESTORATION_HALFWIN] -=
+ 2 * rsi->wiener_info[tile_idx].hfilter[i];
+ }
+ }
+ } else if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) {
+ for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
+ if (rsi->restoration_type[tile_idx] == RESTORE_WIENER) {
+ rsi->wiener_info[tile_idx].vfilter[RESTORATION_HALFWIN] =
+ rsi->wiener_info[tile_idx].hfilter[RESTORATION_HALFWIN] =
+ RESTORATION_FILT_STEP;
+ for (i = 0; i < RESTORATION_HALFWIN; ++i) {
+ rsi->wiener_info[tile_idx].vfilter[RESTORATION_WIN - 1 - i] =
+ rsi->wiener_info[tile_idx].vfilter[i];
+ rsi->wiener_info[tile_idx].hfilter[RESTORATION_WIN - 1 - i] =
+ rsi->wiener_info[tile_idx].hfilter[i];
+ rsi->wiener_info[tile_idx].vfilter[RESTORATION_HALFWIN] -=
+ 2 * rsi->wiener_info[tile_idx].vfilter[i];
+ rsi->wiener_info[tile_idx].hfilter[RESTORATION_HALFWIN] -=
+ 2 * rsi->wiener_info[tile_idx].hfilter[i];
+ }
+ }
+ }
+ }
+}
+
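+// Apply the bilateral filter to one restoration tile. Each output pixel is a
+// normalized sum over a (2 * RESTORATION_HALFWIN + 1)^2 window, weighted by
+// the precomputed spatial LUT and by the range LUT indexed on the intensity
+// difference from the center pixel. Subtiles with level < 0 are skipped.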
+static void loop_bilateral_filter_tile(uint8_t *data, int tile_idx, int width,
+ int height, int stride,
+ RestorationInternal *rst,
+ uint8_t *tmpdata, int tmpstride) {
+ int i, j, subtile_idx;
+ int h_start, h_end, v_start, v_end;
+ const int tile_width = rst->tile_width >> rst->subsampling_x;
+ const int tile_height = rst->tile_height >> rst->subsampling_y;
+
+ for (subtile_idx = 0; subtile_idx < BILATERAL_SUBTILES; ++subtile_idx) {
+ uint8_t *data_p, *tmpdata_p;
+ const int level = rst->rsi->bilateral_info[tile_idx].level[subtile_idx];
+ uint8_t(*wx_lut)[RESTORATION_WIN];
+ uint8_t *wr_lut_;
+
+ if (level < 0) continue;
+ wr_lut_ = (rst->keyframe ? bilateral_filter_coeffs_r_kf[level]
+ : bilateral_filter_coeffs_r[level]) +
+ BILATERAL_AMP_RANGE;
+ wx_lut = rst->keyframe ? bilateral_filter_coeffs_s_kf[level]
+ : bilateral_filter_coeffs_s[level];
+
+ av1_get_rest_tile_limits(tile_idx, subtile_idx, BILATERAL_SUBTILE_BITS,
+ rst->nhtiles, rst->nvtiles, tile_width,
+ tile_height, width, height, 1, 1, &h_start, &h_end,
+ &v_start, &v_end);
+
+ data_p = data + h_start + v_start * stride;
+ tmpdata_p = tmpdata + h_start + v_start * tmpstride;
+
+ for (i = 0; i < (v_end - v_start); ++i) {
+ for (j = 0; j < (h_end - h_start); ++j) {
+ int x, y, wt;
+ int64_t flsum = 0, wtsum = 0;
+ uint8_t *data_p2 = data_p + j - RESTORATION_HALFWIN * stride;
+ for (y = -RESTORATION_HALFWIN; y <= RESTORATION_HALFWIN; ++y) {
+ for (x = -RESTORATION_HALFWIN; x <= RESTORATION_HALFWIN; ++x) {
+ wt = (int)wx_lut[y + RESTORATION_HALFWIN][x + RESTORATION_HALFWIN] *
+ (int)wr_lut_[data_p2[x] - data_p[j]];
+ wtsum += (int64_t)wt;
+ flsum += (int64_t)wt * data_p2[x];
+ }
+ data_p2 += stride;
+ }
+ if (wtsum > 0)
+ tmpdata_p[j] = clip_pixel((int)((flsum + wtsum / 2) / wtsum));
+ else
+ tmpdata_p[j] = data_p[j];
+ }
+ tmpdata_p += tmpstride;
+ data_p += stride;
+ }
+ for (i = v_start; i < v_end; ++i) {
+ memcpy(data + i * stride + h_start, tmpdata + i * tmpstride + h_start,
+ (h_end - h_start) * sizeof(*data));
+ }
+ }
+}
+
+static void loop_bilateral_filter(uint8_t *data, int width, int height,
+ int stride, RestorationInternal *rst,
+ uint8_t *tmpdata, int tmpstride) {
+ int tile_idx;
+ for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
+ loop_bilateral_filter_tile(data, tile_idx, width, height, stride, rst,
+ tmpdata, tmpstride);
+ }
+}
+
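+// Horizontal pass of the symmetric Wiener filter: the center tap weights
+// d[0] and each remaining tap weights the mirrored pair d[i] + d[-i], with
+// rounding applied before shifting back from RESTORATION_FILT_BITS
+// fixed-point precision.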
+uint8_t hor_sym_filter(uint8_t *d, int *hfilter) {
+ int32_t s =
+ (1 << (RESTORATION_FILT_BITS - 1)) + d[0] * hfilter[RESTORATION_HALFWIN];
+ int i;
+ for (i = 1; i <= RESTORATION_HALFWIN; ++i)
+ s += (d[i] + d[-i]) * hfilter[RESTORATION_HALFWIN + i];
+ return clip_pixel(s >> RESTORATION_FILT_BITS);
+}
+
+uint8_t ver_sym_filter(uint8_t *d, int stride, int *vfilter) {
+ int32_t s =
+ (1 << (RESTORATION_FILT_BITS - 1)) + d[0] * vfilter[RESTORATION_HALFWIN];
+ int i;
+ for (i = 1; i <= RESTORATION_HALFWIN; ++i)
+ s += (d[i * stride] + d[-i * stride]) * vfilter[RESTORATION_HALFWIN + i];
+ return clip_pixel(s >> RESTORATION_FILT_BITS);
+}
+
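+// Apply the separable Wiener filter to one tile: filter row-wise from the
+// source into the temporary buffer, then column-wise from the temporary
+// buffer back into the source. Tiles with level == 0 are left untouched.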
+static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width,
+ int height, int stride,
+ RestorationInternal *rst, uint8_t *tmpdata,
+ int tmpstride) {
+ const int tile_width = rst->tile_width >> rst->subsampling_x;
+ const int tile_height = rst->tile_height >> rst->subsampling_y;
+ int i, j;
+ int h_start, h_end, v_start, v_end;
+ uint8_t *data_p, *tmpdata_p;
+
+ if (rst->rsi->wiener_info[tile_idx].level == 0) return;
+ // Filter row-wise
+ av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
+ tile_width, tile_height, width, height, 1, 0,
+ &h_start, &h_end, &v_start, &v_end);
+ data_p = data + h_start + v_start * stride;
+ tmpdata_p = tmpdata + h_start + v_start * tmpstride;
+ for (i = 0; i < (v_end - v_start); ++i) {
+ for (j = 0; j < (h_end - h_start); ++j) {
+ *tmpdata_p++ =
+ hor_sym_filter(data_p++, rst->rsi->wiener_info[tile_idx].hfilter);
+ }
+ data_p += stride - (h_end - h_start);
+ tmpdata_p += tmpstride - (h_end - h_start);
+ }
+ // Filter col-wise
+ av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
+ tile_width, tile_height, width, height, 0, 1,
+ &h_start, &h_end, &v_start, &v_end);
+ data_p = data + h_start + v_start * stride;
+ tmpdata_p = tmpdata + h_start + v_start * tmpstride;
+ for (i = 0; i < (v_end - v_start); ++i) {
+ for (j = 0; j < (h_end - h_start); ++j) {
+ *data_p++ = ver_sym_filter(tmpdata_p++, tmpstride,
+ rst->rsi->wiener_info[tile_idx].vfilter);
+ }
+ data_p += stride - (h_end - h_start);
+ tmpdata_p += tmpstride - (h_end - h_start);
+ }
+}
+
+static void loop_wiener_filter(uint8_t *data, int width, int height, int stride,
+ RestorationInternal *rst, uint8_t *tmpdata,
+ int tmpstride) {
+ int i, tile_idx;
+ uint8_t *data_p, *tmpdata_p;
+
+ // Initialize tmp buffer
+ data_p = data;
+ tmpdata_p = tmpdata;
+ for (i = 0; i < height; ++i) {
+ memcpy(tmpdata_p, data_p, sizeof(*data_p) * width);
+ data_p += stride;
+ tmpdata_p += tmpstride;
+ }
+ for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
+ loop_wiener_filter_tile(data, tile_idx, width, height, stride, rst, tmpdata,
+ tmpstride);
+ }
+}
+
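+// Per-tile switchable restoration: the frame is first copied into the
+// temporary buffer, then each tile is restored with either the bilateral or
+// the Wiener filter according to its per-tile restoration type.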
+static void loop_switchable_filter(uint8_t *data, int width, int height,
+ int stride, RestorationInternal *rst,
+ uint8_t *tmpdata, int tmpstride) {
+ int i, tile_idx;
+ uint8_t *data_p, *tmpdata_p;
+
+ // Initialize tmp buffer
+ data_p = data;
+ tmpdata_p = tmpdata;
+ for (i = 0; i < height; ++i) {
+ memcpy(tmpdata_p, data_p, sizeof(*data_p) * width);
+ data_p += stride;
+ tmpdata_p += tmpstride;
+ }
+ for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
+ if (rst->rsi->restoration_type[tile_idx] == RESTORE_BILATERAL) {
+ loop_bilateral_filter_tile(data, tile_idx, width, height, stride, rst,
+ tmpdata, tmpstride);
+ } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_WIENER) {
+ loop_wiener_filter_tile(data, tile_idx, width, height, stride, rst,
+ tmpdata, tmpstride);
+ }
+ }
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+static void loop_bilateral_filter_tile_highbd(uint16_t *data, int tile_idx,
+ int width, int height, int stride,
+ RestorationInternal *rst,
+ uint16_t *tmpdata, int tmpstride,
+ int bit_depth) {
+ const int tile_width = rst->tile_width >> rst->subsampling_x;
+ const int tile_height = rst->tile_height >> rst->subsampling_y;
+ int i, j, subtile_idx;
+ int h_start, h_end, v_start, v_end;
+ const int shift = bit_depth - 8;
+
+ for (subtile_idx = 0; subtile_idx < BILATERAL_SUBTILES; ++subtile_idx) {
+ uint16_t *data_p, *tmpdata_p;
+ const int level = rst->rsi->bilateral_info[tile_idx].level[subtile_idx];
+ uint8_t(*wx_lut)[RESTORATION_WIN];
+ uint8_t *wr_lut_;
+
+ if (level < 0) continue;
+ wr_lut_ = (rst->keyframe ? bilateral_filter_coeffs_r_kf[level]
+ : bilateral_filter_coeffs_r[level]) +
+ BILATERAL_AMP_RANGE;
+ wx_lut = rst->keyframe ? bilateral_filter_coeffs_s_kf[level]
+ : bilateral_filter_coeffs_s[level];
+ av1_get_rest_tile_limits(tile_idx, subtile_idx, BILATERAL_SUBTILE_BITS,
+ rst->nhtiles, rst->nvtiles, tile_width,
+ tile_height, width, height, 1, 1, &h_start, &h_end,
+ &v_start, &v_end);
+
+ data_p = data + h_start + v_start * stride;
+ tmpdata_p = tmpdata + h_start + v_start * tmpstride;
+
+ for (i = 0; i < (v_end - v_start); ++i) {
+ for (j = 0; j < (h_end - h_start); ++j) {
+ int x, y, wt;
+ int64_t flsum = 0, wtsum = 0;
+ uint16_t *data_p2 = data_p + j - RESTORATION_HALFWIN * stride;
+ for (y = -RESTORATION_HALFWIN; y <= RESTORATION_HALFWIN; ++y) {
+ for (x = -RESTORATION_HALFWIN; x <= RESTORATION_HALFWIN; ++x) {
+ wt = (int)wx_lut[y + RESTORATION_HALFWIN][x + RESTORATION_HALFWIN] *
+ (int)wr_lut_[(data_p2[x] >> shift) - (data_p[j] >> shift)];
+ wtsum += (int64_t)wt;
+ flsum += (int64_t)wt * data_p2[x];
+ }
+ data_p2 += stride;
+ }
+ if (wtsum > 0)
+ tmpdata_p[j] =
+ clip_pixel_highbd((int)((flsum + wtsum / 2) / wtsum), bit_depth);
+ else
+ tmpdata_p[j] = data_p[j];
+ }
+ tmpdata_p += tmpstride;
+ data_p += stride;
+ }
+ for (i = v_start; i < v_end; ++i) {
+ memcpy(data + i * stride + h_start, tmpdata + i * tmpstride + h_start,
+ (h_end - h_start) * sizeof(*data));
+ }
+ }
+}
+
+static void loop_bilateral_filter_highbd(uint8_t *data8, int width, int height,
+ int stride, RestorationInternal *rst,
+ uint8_t *tmpdata8, int tmpstride,
+ int bit_depth) {
+ int tile_idx;
+ uint16_t *data = CONVERT_TO_SHORTPTR(data8);
+ uint16_t *tmpdata = CONVERT_TO_SHORTPTR(tmpdata8);
+
+ for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
+ loop_bilateral_filter_tile_highbd(data, tile_idx, width, height, stride,
+ rst, tmpdata, tmpstride, bit_depth);
+ }
+}
+
+uint16_t hor_sym_filter_highbd(uint16_t *d, int *hfilter, int bd) {
+ int32_t s =
+ (1 << (RESTORATION_FILT_BITS - 1)) + d[0] * hfilter[RESTORATION_HALFWIN];
+ int i;
+ for (i = 1; i <= RESTORATION_HALFWIN; ++i)
+ s += (d[i] + d[-i]) * hfilter[RESTORATION_HALFWIN + i];
+ return clip_pixel_highbd(s >> RESTORATION_FILT_BITS, bd);
+}
+
+uint16_t ver_sym_filter_highbd(uint16_t *d, int stride, int *vfilter, int bd) {
+ int32_t s =
+ (1 << (RESTORATION_FILT_BITS - 1)) + d[0] * vfilter[RESTORATION_HALFWIN];
+ int i;
+ for (i = 1; i <= RESTORATION_HALFWIN; ++i)
+ s += (d[i * stride] + d[-i * stride]) * vfilter[RESTORATION_HALFWIN + i];
+ return clip_pixel_highbd(s >> RESTORATION_FILT_BITS, bd);
+}
+
+static void loop_wiener_filter_tile_highbd(uint16_t *data, int tile_idx,
+ int width, int height, int stride,
+ RestorationInternal *rst,
+ uint16_t *tmpdata, int tmpstride,
+ int bit_depth) {
+ const int tile_width = rst->tile_width >> rst->subsampling_x;
+ const int tile_height = rst->tile_height >> rst->subsampling_y;
+ int h_start, h_end, v_start, v_end;
+ int i, j;
+ uint16_t *data_p, *tmpdata_p;
+
+ if (rst->rsi->wiener_info[tile_idx].level == 0) return;
+ // Filter row-wise
+ av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
+ tile_width, tile_height, width, height, 1, 0,
+ &h_start, &h_end, &v_start, &v_end);
+ data_p = data + h_start + v_start * stride;
+ tmpdata_p = tmpdata + h_start + v_start * tmpstride;
+ for (i = 0; i < (v_end - v_start); ++i) {
+ for (j = 0; j < (h_end - h_start); ++j) {
+ *tmpdata_p++ = hor_sym_filter_highbd(
+ data_p++, rst->rsi->wiener_info[tile_idx].hfilter, bit_depth);
+ }
+ data_p += stride - (h_end - h_start);
+ tmpdata_p += tmpstride - (h_end - h_start);
+ }
+ // Filter col-wise
+ av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
+ tile_width, tile_height, width, height, 0, 1,
+ &h_start, &h_end, &v_start, &v_end);
+ data_p = data + h_start + v_start * stride;
+ tmpdata_p = tmpdata + h_start + v_start * tmpstride;
+ for (i = 0; i < (v_end - v_start); ++i) {
+ for (j = 0; j < (h_end - h_start); ++j) {
+ *data_p++ = ver_sym_filter_highbd(tmpdata_p++, tmpstride,
+ rst->rsi->wiener_info[tile_idx].vfilter,
+ bit_depth);
+ }
+ data_p += stride - (h_end - h_start);
+ tmpdata_p += tmpstride - (h_end - h_start);
+ }
+}
+
+static void loop_wiener_filter_highbd(uint8_t *data8, int width, int height,
+ int stride, RestorationInternal *rst,
+ uint8_t *tmpdata8, int tmpstride,
+ int bit_depth) {
+ uint16_t *data = CONVERT_TO_SHORTPTR(data8);
+ uint16_t *tmpdata = CONVERT_TO_SHORTPTR(tmpdata8);
+ int i, tile_idx;
+ uint16_t *data_p, *tmpdata_p;
+
+ // Initialize tmp buffer
+ data_p = data;
+ tmpdata_p = tmpdata;
+ for (i = 0; i < height; ++i) {
+ memcpy(tmpdata_p, data_p, sizeof(*data_p) * width);
+ data_p += stride;
+ tmpdata_p += tmpstride;
+ }
+ for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
+ loop_wiener_filter_tile_highbd(data, tile_idx, width, height, stride, rst,
+ tmpdata, tmpstride, bit_depth);
+ }
+}
+
+static void loop_switchable_filter_highbd(uint8_t *data8, int width, int height,
+ int stride, RestorationInternal *rst,
+ uint8_t *tmpdata8, int tmpstride,
+ int bit_depth) {
+ uint16_t *data = CONVERT_TO_SHORTPTR(data8);
+ uint16_t *tmpdata = CONVERT_TO_SHORTPTR(tmpdata8);
+ int i, tile_idx;
+ uint16_t *data_p, *tmpdata_p;
+
+ // Initialize tmp buffer
+ data_p = data;
+ tmpdata_p = tmpdata;
+ for (i = 0; i < height; ++i) {
+ memcpy(tmpdata_p, data_p, sizeof(*data_p) * width);
+ data_p += stride;
+ tmpdata_p += tmpstride;
+ }
+ for (tile_idx = 0; tile_idx < rst->ntiles; ++tile_idx) {
+ if (rst->rsi->restoration_type[tile_idx] == RESTORE_BILATERAL) {
+ loop_bilateral_filter_tile_highbd(data, tile_idx, width, height, stride,
+ rst, tmpdata, tmpstride, bit_depth);
+ } else if (rst->rsi->restoration_type[tile_idx] == RESTORE_WIENER) {
+ loop_wiener_filter_tile_highbd(data, tile_idx, width, height, stride, rst,
+ tmpdata, tmpstride, bit_depth);
+ }
+ }
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
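+// Restore the rows covered by [start_mi_row, end_mi_row) of the given frame.
+// The frame restoration type selects the restore function (bilateral,
+// Wiener, or per-tile switchable); a temporary frame buffer of matching
+// geometry is allocated as scratch space and freed before returning. Chroma
+// planes are processed only when y_only is zero.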
+void av1_loop_restoration_rows(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+ int start_mi_row, int end_mi_row, int y_only) {
+ const int ywidth = frame->y_crop_width;
+ const int ystride = frame->y_stride;
+ const int uvwidth = frame->uv_crop_width;
+ const int uvstride = frame->uv_stride;
+ const int ystart = start_mi_row << MI_SIZE_LOG2;
+ const int uvstart = ystart >> cm->subsampling_y;
+ int yend = end_mi_row << MI_SIZE_LOG2;
+ int uvend = yend >> cm->subsampling_y;
+ restore_func_type restore_func =
+ cm->rst_internal.rsi->frame_restoration_type == RESTORE_BILATERAL
+ ? loop_bilateral_filter
+ : (cm->rst_internal.rsi->frame_restoration_type == RESTORE_WIENER
+ ? loop_wiener_filter
+ : loop_switchable_filter);
+#if CONFIG_AOM_HIGHBITDEPTH
+ restore_func_highbd_type restore_func_highbd =
+ cm->rst_internal.rsi->frame_restoration_type == RESTORE_BILATERAL
+ ? loop_bilateral_filter_highbd
+ : (cm->rst_internal.rsi->frame_restoration_type == RESTORE_WIENER
+ ? loop_wiener_filter_highbd
+ : loop_switchable_filter_highbd);
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ YV12_BUFFER_CONFIG tmp_buf;
+
+ if (cm->rst_internal.rsi->frame_restoration_type == RESTORE_NONE) return;
+
+ memset(&tmp_buf, 0, sizeof(YV12_BUFFER_CONFIG));
+
+ yend = AOMMIN(yend, cm->height);
+ uvend = AOMMIN(uvend, cm->subsampling_y ? (cm->height + 1) >> 1 : cm->height);
+
+ if (aom_realloc_frame_buffer(
+ &tmp_buf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_AOM_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL) < 0)
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate tmp restoration buffer");
+
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ restore_func_highbd(frame->y_buffer + ystart * ystride, ywidth,
+ yend - ystart, ystride, &cm->rst_internal,
+ tmp_buf.y_buffer + ystart * tmp_buf.y_stride,
+ tmp_buf.y_stride, cm->bit_depth);
+ else
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ restore_func(frame->y_buffer + ystart * ystride, ywidth, yend - ystart,
+ ystride, &cm->rst_internal,
+ tmp_buf.y_buffer + ystart * tmp_buf.y_stride,
+ tmp_buf.y_stride);
+ if (!y_only) {
+ cm->rst_internal.subsampling_x = cm->subsampling_x;
+ cm->rst_internal.subsampling_y = cm->subsampling_y;
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ restore_func_highbd(frame->u_buffer + uvstart * uvstride, uvwidth,
+ uvend - uvstart, uvstride, &cm->rst_internal,
+ tmp_buf.u_buffer + uvstart * tmp_buf.uv_stride,
+ tmp_buf.uv_stride, cm->bit_depth);
+ restore_func_highbd(frame->v_buffer + uvstart * uvstride, uvwidth,
+ uvend - uvstart, uvstride, &cm->rst_internal,
+ tmp_buf.v_buffer + uvstart * tmp_buf.uv_stride,
+ tmp_buf.uv_stride, cm->bit_depth);
+ } else {
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ restore_func(frame->u_buffer + uvstart * uvstride, uvwidth,
+ uvend - uvstart, uvstride, &cm->rst_internal,
+ tmp_buf.u_buffer + uvstart * tmp_buf.uv_stride,
+ tmp_buf.uv_stride);
+ restore_func(frame->v_buffer + uvstart * uvstride, uvwidth,
+ uvend - uvstart, uvstride, &cm->rst_internal,
+ tmp_buf.v_buffer + uvstart * tmp_buf.uv_stride,
+ tmp_buf.uv_stride);
+#if CONFIG_AOM_HIGHBITDEPTH
+ }
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ }
+ aom_free_frame_buffer(&tmp_buf);
+}
+
+void av1_loop_restoration_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+ RestorationInfo *rsi, int y_only,
+ int partial_frame) {
+ int start_mi_row, end_mi_row, mi_rows_to_filter;
+ if (rsi->frame_restoration_type != RESTORE_NONE) {
+ start_mi_row = 0;
+ mi_rows_to_filter = cm->mi_rows;
+ if (partial_frame && cm->mi_rows > 8) {
+ start_mi_row = cm->mi_rows >> 1;
+ start_mi_row &= 0xfffffff8;
+ mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8);
+ }
+ end_mi_row = start_mi_row + mi_rows_to_filter;
+ av1_loop_restoration_init(&cm->rst_internal, rsi,
+ cm->frame_type == KEY_FRAME, cm->width,
+ cm->height);
+ av1_loop_restoration_rows(frame, cm, start_mi_row, end_mi_row, y_only);
+ }
+}
diff --git a/av1/common/restoration.h b/av1/common/restoration.h
new file mode 100644
index 0000000..a5150ad
--- /dev/null
+++ b/av1/common/restoration.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AV1_COMMON_RESTORATION_H_
+#define AV1_COMMON_RESTORATION_H_
+
+#include "aom_ports/mem.h"
+#include "./aom_config.h"
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BILATERAL_LEVEL_BITS_KF 4
+#define BILATERAL_LEVELS_KF (1 << BILATERAL_LEVEL_BITS_KF)
+#define BILATERAL_LEVEL_BITS 3
+#define BILATERAL_LEVELS (1 << BILATERAL_LEVEL_BITS)
+// #define DEF_BILATERAL_LEVEL 2
+
+#define RESTORATION_TILESIZE_SML 128
+#define RESTORATION_TILESIZE_BIG 256
+#define BILATERAL_SUBTILE_BITS 1
+#define BILATERAL_SUBTILES (1 << (2 * BILATERAL_SUBTILE_BITS))
+
+#define RESTORATION_HALFWIN 3
+#define RESTORATION_HALFWIN1 (RESTORATION_HALFWIN + 1)
+#define RESTORATION_WIN (2 * RESTORATION_HALFWIN + 1)
+#define RESTORATION_WIN2 ((RESTORATION_WIN) * (RESTORATION_WIN))
+
+#define RESTORATION_FILT_BITS 7
+#define RESTORATION_FILT_STEP (1 << RESTORATION_FILT_BITS)
+
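+// Allowed range of each Wiener half-filter tap: tap i takes one of
+// 2^WIENER_FILT_TAPi_BITS values in
+// [WIENER_FILT_TAPi_MINV, WIENER_FILT_TAPi_MAXV].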
+#define WIENER_FILT_TAP0_MINV (-5)
+#define WIENER_FILT_TAP1_MINV (-23)
+#define WIENER_FILT_TAP2_MINV (-16)
+
+#define WIENER_FILT_TAP0_BITS 4
+#define WIENER_FILT_TAP1_BITS 5
+#define WIENER_FILT_TAP2_BITS 6
+
+#define WIENER_FILT_BITS \
+ ((WIENER_FILT_TAP0_BITS + WIENER_FILT_TAP1_BITS + WIENER_FILT_TAP2_BITS) * 2)
+
+#define WIENER_FILT_TAP0_MAXV \
+ (WIENER_FILT_TAP0_MINV - 1 + (1 << WIENER_FILT_TAP0_BITS))
+#define WIENER_FILT_TAP1_MAXV \
+ (WIENER_FILT_TAP1_MINV - 1 + (1 << WIENER_FILT_TAP1_BITS))
+#define WIENER_FILT_TAP2_MAXV \
+ (WIENER_FILT_TAP2_MINV - 1 + (1 << WIENER_FILT_TAP2_BITS))
+
+typedef struct { int level[BILATERAL_SUBTILES]; } BilateralInfo;
+
+typedef struct {
+ int level;
+ int vfilter[RESTORATION_WIN], hfilter[RESTORATION_WIN];
+} WienerInfo;
+
+typedef struct {
+ RestorationType frame_restoration_type;
+ RestorationType *restoration_type;
+ // Bilateral filter
+ BilateralInfo *bilateral_info;
+ // Wiener filter
+ WienerInfo *wiener_info;
+} RestorationInfo;
+
+typedef struct {
+ RestorationInfo *rsi;
+ int keyframe;
+ int subsampling_x;
+ int subsampling_y;
+ int ntiles;
+ int tile_width, tile_height;
+ int nhtiles, nvtiles;
+} RestorationInternal;
+
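+// Pick the restoration tile size from the frame area: frames no larger than
+// CIF (352x288) use the small tile size, larger frames use the big one.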
+static INLINE int get_rest_tilesize(int width, int height) {
+ if (width * height <= 352 * 288)
+ return RESTORATION_TILESIZE_SML;
+ else
+ return RESTORATION_TILESIZE_BIG;
+}
+
+static INLINE int av1_get_rest_ntiles(int width, int height, int *tile_width,
+ int *tile_height, int *nhtiles,
+ int *nvtiles) {
+ int nhtiles_, nvtiles_;
+ int tile_width_, tile_height_;
+ int tilesize = get_rest_tilesize(width, height);
+ tile_width_ = (tilesize < 0) ? width : AOMMIN(tilesize, width);
+ tile_height_ = (tilesize < 0) ? height : AOMMIN(tilesize, height);
+ nhtiles_ = (width + (tile_width_ >> 1)) / tile_width_;
+ nvtiles_ = (height + (tile_height_ >> 1)) / tile_height_;
+ if (tile_width) *tile_width = tile_width_;
+ if (tile_height) *tile_height = tile_height_;
+ if (nhtiles) *nhtiles = nhtiles_;
+ if (nvtiles) *nvtiles = nvtiles_;
+ return (nhtiles_ * nvtiles_);
+}
+
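+// Compute the pixel bounds [h_start, h_end) x [v_start, v_end) of a
+// restoration tile (or of one of its subtiles when subtile_bits > 0), with
+// edge tiles extended to the image border. When clamp_h/clamp_v are set the
+// bounds are shrunk by RESTORATION_HALFWIN so the filter window stays inside
+// the image.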
+static INLINE void av1_get_rest_tile_limits(
+ int tile_idx, int subtile_idx, int subtile_bits, int nhtiles, int nvtiles,
+ int tile_width, int tile_height, int im_width, int im_height, int clamp_h,
+ int clamp_v, int *h_start, int *h_end, int *v_start, int *v_end) {
+ const int htile_idx = tile_idx % nhtiles;
+ const int vtile_idx = tile_idx / nhtiles;
+ *h_start = htile_idx * tile_width;
+ *v_start = vtile_idx * tile_height;
+ *h_end = (htile_idx < nhtiles - 1) ? *h_start + tile_width : im_width;
+ *v_end = (vtile_idx < nvtiles - 1) ? *v_start + tile_height : im_height;
+ if (subtile_bits) {
+ const int num_subtiles_1d = (1 << subtile_bits);
+ const int subtile_width = (*h_end - *h_start) >> subtile_bits;
+ const int subtile_height = (*v_end - *v_start) >> subtile_bits;
+ const int subtile_idx_h = subtile_idx & (num_subtiles_1d - 1);
+ const int subtile_idx_v = subtile_idx >> subtile_bits;
+ *h_start += subtile_idx_h * subtile_width;
+ *v_start += subtile_idx_v * subtile_height;
+ *h_end = subtile_idx_h == num_subtiles_1d - 1 ? *h_end
+ : *h_start + subtile_width;
+ *v_end = subtile_idx_v == num_subtiles_1d - 1 ? *v_end
+ : *v_start + subtile_height;
+ }
+ if (clamp_h) {
+ *h_start = AOMMAX(*h_start, RESTORATION_HALFWIN);
+ *h_end = AOMMIN(*h_end, im_width - RESTORATION_HALFWIN);
+ }
+ if (clamp_v) {
+ *v_start = AOMMAX(*v_start, RESTORATION_HALFWIN);
+ *v_end = AOMMIN(*v_end, im_height - RESTORATION_HALFWIN);
+ }
+}
+
+int av1_bilateral_level_bits(const struct AV1Common *const cm);
+void av1_loop_restoration_init(RestorationInternal *rst, RestorationInfo *rsi,
+ int kf, int width, int height);
+void av1_loop_restoration_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+ RestorationInfo *rsi, int y_only,
+ int partial_frame);
+void av1_loop_restoration_rows(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+ int start_mi_row, int end_mi_row, int y_only);
+void av1_loop_restoration_precal(void);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_COMMON_RESTORATION_H_
diff --git a/av1/common/scale.c b/av1/common/scale.c
index bc202e9..896c4e1 100644
--- a/av1/common/scale.c
+++ b/av1/common/scale.c
@@ -70,12 +70,22 @@
sf->scale_value_y = unscaled_value;
}
- // TODO(agrange): Investigate the best choice of functions to use here
- // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what
- // to do at full-pel offsets. The current selection, where the filter is
- // applied in one direction only, and not at all for 0,0, seems to give the
- // best quality, but it may be worth trying an additional mode that does
- // do the filtering on full-pel.
+// TODO(agrange): Investigate the best choice of functions to use here
+// for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what
+// to do at full-pel offsets. The current selection, where the filter is
+// applied in one direction only, and not at all for 0,0, seems to give the
+// best quality, but it may be worth trying an additional mode that does
+// do the filtering on full-pel.
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+ sf->predict_ni[0][0][0] = aom_convolve8_c;
+ sf->predict_ni[0][0][1] = aom_convolve8_avg_c;
+ sf->predict_ni[0][1][0] = aom_convolve8_c;
+ sf->predict_ni[0][1][1] = aom_convolve8_avg_c;
+ sf->predict_ni[1][0][0] = aom_convolve8_c;
+ sf->predict_ni[1][0][1] = aom_convolve8_avg_c;
+ sf->predict_ni[1][1][0] = aom_convolve8;
+ sf->predict_ni[1][1][1] = aom_convolve8_avg;
+#endif // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
if (sf->x_step_q4 == 16) {
if (sf->y_step_q4 == 16) {
// No scaling in either direction.
@@ -116,8 +126,19 @@
// 2D subpel motion always gets filtered in both directions
sf->predict[1][1][0] = aom_convolve8;
sf->predict[1][1][1] = aom_convolve8_avg;
+
#if CONFIG_AOM_HIGHBITDEPTH
if (use_highbd) {
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+ sf->highbd_predict_ni[0][0][0] = aom_highbd_convolve8_c;
+ sf->highbd_predict_ni[0][0][1] = aom_highbd_convolve8_avg_c;
+ sf->highbd_predict_ni[0][1][0] = aom_highbd_convolve8_c;
+ sf->highbd_predict_ni[0][1][1] = aom_highbd_convolve8_avg_c;
+ sf->highbd_predict_ni[1][0][0] = aom_highbd_convolve8_c;
+ sf->highbd_predict_ni[1][0][1] = aom_highbd_convolve8_avg_c;
+ sf->highbd_predict_ni[1][1][0] = aom_highbd_convolve8;
+ sf->highbd_predict_ni[1][1][1] = aom_highbd_convolve8_avg;
+#endif // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
if (sf->x_step_q4 == 16) {
if (sf->y_step_q4 == 16) {
// No scaling in either direction.
@@ -159,5 +180,5 @@
sf->highbd_predict[1][1][0] = aom_highbd_convolve8;
sf->highbd_predict[1][1][1] = aom_highbd_convolve8_avg;
}
-#endif
+#endif // CONFIG_AOM_HIGHBITDEPTH
}
diff --git a/av1/common/scale.h b/av1/common/scale.h
index 16ec349..0b49b68 100644
--- a/av1/common/scale.h
+++ b/av1/common/scale.h
@@ -1,12 +1,11 @@
/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
*/
#ifndef AV1_COMMON_SCALE_H_
@@ -35,7 +34,15 @@
convolve_fn_t predict[2][2][2]; // horiz, vert, avg
#if CONFIG_AOM_HIGHBITDEPTH
highbd_convolve_fn_t highbd_predict[2][2][2]; // horiz, vert, avg
-#endif
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+// Functions for non-interpolating filters (those that filter zero offsets)
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+ convolve_fn_t predict_ni[2][2][2]; // horiz, vert, avg
+#if CONFIG_AOM_HIGHBITDEPTH
+ highbd_convolve_fn_t highbd_predict_ni[2][2][2]; // horiz, vert, avg
+#endif // CONFIG_AOM_HIGHBITDEPTH
+#endif // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
};
MV32 av1_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
@@ -47,7 +54,7 @@
#else
void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
int other_h, int this_w, int this_h);
-#endif
+#endif // CONFIG_AOM_HIGHBITDEPTH
static INLINE int av1_is_valid_scale(const struct scale_factors *sf) {
return sf->x_scale_fp != REF_INVALID_SCALE &&
diff --git a/av1/common/scan.c b/av1/common/scan.c
index ffab9fe..b2386b9 100644
--- a/av1/common/scan.c
+++ b/av1/common/scan.c
@@ -18,6 +18,16 @@
0, 4, 1, 5, 8, 2, 12, 9, 3, 6, 13, 10, 7, 14, 11, 15,
};
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = {
+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x4[16]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+};
+#endif // CONFIG_EXT_TX
+
DECLARE_ALIGNED(16, static const int16_t, col_scan_4x4[16]) = {
0, 4, 8, 1, 12, 5, 9, 2, 13, 6, 10, 3, 7, 14, 11, 15,
};
@@ -26,6 +36,40 @@
0, 1, 4, 2, 5, 3, 6, 8, 9, 7, 12, 10, 13, 11, 14, 15,
};
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8[32]) = {
+ 0, 1, 4, 5, 2, 8, 6, 9, 10, 3, 12, 7, 13, 11, 14, 16,
+ 17, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x8[32]) = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29,
+ 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x8[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+#endif
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x4[32]) = {
+ 0, 1, 8, 9, 2, 16, 10, 17, 18, 3, 24, 11, 25, 19, 26, 4,
+ 12, 27, 20, 5, 28, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x4[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x4[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+#endif
+
DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = {
0, 8, 1, 16, 9, 2, 17, 24, 10, 3, 18, 25, 32, 11, 4, 26,
33, 19, 40, 12, 34, 27, 5, 41, 20, 48, 13, 35, 42, 28, 21, 6,
@@ -33,6 +77,22 @@
45, 23, 52, 59, 38, 31, 60, 53, 46, 39, 61, 54, 47, 62, 55, 63,
};
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x8[64]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57,
+ 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59,
+ 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61,
+ 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x8[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+#endif // CONFIG_EXT_TX
+
DECLARE_ALIGNED(16, static const int16_t, col_scan_8x8[64]) = {
0, 8, 16, 1, 24, 9, 32, 17, 2, 40, 25, 10, 33, 18, 48, 3,
26, 41, 11, 56, 19, 34, 4, 49, 27, 42, 12, 35, 20, 57, 50, 28,
@@ -47,6 +107,304 @@
58, 45, 38, 52, 31, 59, 53, 46, 60, 39, 61, 47, 54, 55, 62, 63,
};
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x16[128]) = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32,
+ 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14,
+ 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23,
+ 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80,
+ 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89,
+ 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98,
+ 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107,
+ 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110,
+ 117, 124, 111, 118, 125, 119, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x8[128]) = {
+ 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64, 5,
+ 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22, 37, 52,
+ 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 9, 24, 39, 54,
+ 69, 84, 99, 114, 10, 25, 40, 55, 70, 85, 100, 115, 11, 26, 41, 56,
+ 71, 86, 101, 116, 12, 27, 42, 57, 72, 87, 102, 117, 13, 28, 43, 58,
+ 73, 88, 103, 118, 14, 29, 44, 59, 74, 89, 104, 119, 15, 30, 45, 60,
+ 75, 90, 105, 120, 31, 46, 61, 76, 91, 106, 121, 47, 62, 77, 92, 107,
+ 122, 63, 78, 93, 108, 123, 79, 94, 109, 124, 95, 110, 125, 111, 126, 127,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x16[128]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
+ 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
+ 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
+ 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123,
+ 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124,
+ 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125,
+ 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
+ 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x8[128]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113,
+ 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115,
+ 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117,
+ 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119,
+ 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121,
+ 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123,
+ 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125,
+ 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x16[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x8[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+#endif
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x32[512]) = {
+ 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64,
+ 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22,
+ 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128,
+ 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70,
+ 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131,
+ 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177,
+ 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208,
+ 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224,
+ 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225,
+ 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226,
+ 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227,
+ 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228,
+ 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229,
+ 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230,
+ 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231,
+ 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232,
+ 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233,
+ 248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234,
+ 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235,
+ 250, 265, 280, 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236,
+ 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237,
+ 252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238,
+ 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239,
+ 254, 269, 284, 299, 314, 329, 344, 359, 374, 389, 404, 419, 434, 449, 464,
+ 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465,
+ 480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466,
+ 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467,
+ 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 438, 453, 468, 483,
+ 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335,
+ 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396,
+ 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472,
+ 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
+ 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476,
+ 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_32x16[512]) = {
+ 0, 1, 32, 2, 33, 64, 3, 34, 65, 96, 4, 35, 66, 97, 128,
+ 5, 36, 67, 98, 129, 160, 6, 37, 68, 99, 130, 161, 192, 7, 38,
+ 69, 100, 131, 162, 193, 224, 8, 39, 70, 101, 132, 163, 194, 225, 256,
+ 9, 40, 71, 102, 133, 164, 195, 226, 257, 288, 10, 41, 72, 103, 134,
+ 165, 196, 227, 258, 289, 320, 11, 42, 73, 104, 135, 166, 197, 228, 259,
+ 290, 321, 352, 12, 43, 74, 105, 136, 167, 198, 229, 260, 291, 322, 353,
+ 384, 13, 44, 75, 106, 137, 168, 199, 230, 261, 292, 323, 354, 385, 416,
+ 14, 45, 76, 107, 138, 169, 200, 231, 262, 293, 324, 355, 386, 417, 448,
+ 15, 46, 77, 108, 139, 170, 201, 232, 263, 294, 325, 356, 387, 418, 449,
+ 480, 16, 47, 78, 109, 140, 171, 202, 233, 264, 295, 326, 357, 388, 419,
+ 450, 481, 17, 48, 79, 110, 141, 172, 203, 234, 265, 296, 327, 358, 389,
+ 420, 451, 482, 18, 49, 80, 111, 142, 173, 204, 235, 266, 297, 328, 359,
+ 390, 421, 452, 483, 19, 50, 81, 112, 143, 174, 205, 236, 267, 298, 329,
+ 360, 391, 422, 453, 484, 20, 51, 82, 113, 144, 175, 206, 237, 268, 299,
+ 330, 361, 392, 423, 454, 485, 21, 52, 83, 114, 145, 176, 207, 238, 269,
+ 300, 331, 362, 393, 424, 455, 486, 22, 53, 84, 115, 146, 177, 208, 239,
+ 270, 301, 332, 363, 394, 425, 456, 487, 23, 54, 85, 116, 147, 178, 209,
+ 240, 271, 302, 333, 364, 395, 426, 457, 488, 24, 55, 86, 117, 148, 179,
+ 210, 241, 272, 303, 334, 365, 396, 427, 458, 489, 25, 56, 87, 118, 149,
+ 180, 211, 242, 273, 304, 335, 366, 397, 428, 459, 490, 26, 57, 88, 119,
+ 150, 181, 212, 243, 274, 305, 336, 367, 398, 429, 460, 491, 27, 58, 89,
+ 120, 151, 182, 213, 244, 275, 306, 337, 368, 399, 430, 461, 492, 28, 59,
+ 90, 121, 152, 183, 214, 245, 276, 307, 338, 369, 400, 431, 462, 493, 29,
+ 60, 91, 122, 153, 184, 215, 246, 277, 308, 339, 370, 401, 432, 463, 494,
+ 30, 61, 92, 123, 154, 185, 216, 247, 278, 309, 340, 371, 402, 433, 464,
+ 495, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341, 372, 403, 434,
+ 465, 496, 63, 94, 125, 156, 187, 218, 249, 280, 311, 342, 373, 404, 435,
+ 466, 497, 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467,
+ 498, 127, 158, 189, 220, 251, 282, 313, 344, 375, 406, 437, 468, 499, 159,
+ 190, 221, 252, 283, 314, 345, 376, 407, 438, 469, 500, 191, 222, 253, 284,
+ 315, 346, 377, 408, 439, 470, 501, 223, 254, 285, 316, 347, 378, 409, 440,
+ 471, 502, 255, 286, 317, 348, 379, 410, 441, 472, 503, 287, 318, 349, 380,
+ 411, 442, 473, 504, 319, 350, 381, 412, 443, 474, 505, 351, 382, 413, 444,
+ 475, 506, 383, 414, 445, 476, 507, 415, 446, 477, 508, 447, 478, 509, 479,
+ 510, 511,
+};
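+
+/* The two default scans above follow a simple anti-diagonal pattern: with
+   row-major indexing (stride == block width), coefficients on the diagonal
+   r + c == d are visited for d = 0, 1, 2, ..., and within a diagonal from the
+   top-right element downwards.  A sketch of a generator for this ordering
+   (illustrative only; the library ships the precomputed tables):
+
+   static void make_default_scan(int16_t *scan, int rows, int cols) {
+     int i = 0;
+     for (int d = 0; d <= rows + cols - 2; ++d)    // anti-diagonal r + c == d
+       for (int r = 0; r < rows; ++r) {
+         const int c = d - r;
+         if (c >= 0 && c < cols) scan[i++] = r * cols + c;  // row-major index
+       }
+   }
+
+   Judging from the leading entries, make_default_scan(scan, 32, 16) appears
+   to reproduce default_scan_16x32 (16 columns by 32 rows) and
+   make_default_scan(scan, 16, 32) the 32x16 table.
+*/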
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x32[512]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224,
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464,
+ 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193,
+ 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433,
+ 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162,
+ 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402,
+ 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131,
+ 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371,
+ 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100,
+ 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340,
+ 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69,
+ 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309,
+ 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38,
+ 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278,
+ 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7,
+ 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487,
+ 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216,
+ 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456,
+ 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185,
+ 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425,
+ 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154,
+ 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394,
+ 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123,
+ 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363,
+ 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92,
+ 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332,
+ 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61,
+ 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301,
+ 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30,
+ 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270,
+ 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239,
+ 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479,
+ 495, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x16[512]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480,
+ 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481,
+ 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482,
+ 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484,
+ 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485,
+ 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487,
+ 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488,
+ 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489,
+ 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491,
+ 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492,
+ 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494,
+ 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495,
+ 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496,
+ 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498,
+ 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499,
+ 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501,
+ 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502,
+ 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503,
+ 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505,
+ 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506,
+ 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508,
+ 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509,
+ 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510,
+ 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x32[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x16[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+#endif // CONFIG_EXT_TX
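+
+/* The mrow_/mcol_ tables above are the trivial orderings used by the EXT_TX
+   experiment: mrow_ is plain row-major (identity) order and mcol_ is
+   column-major order.  Equivalent generators, as an illustrative sketch
+   (assuming row-major indexing with stride == block width):
+
+   static void make_mrow_scan(int16_t *scan, int rows, int cols) {
+     for (int i = 0; i < rows * cols; ++i) scan[i] = i;  // row by row
+   }
+
+   static void make_mcol_scan(int16_t *scan, int rows, int cols) {
+     int i = 0;
+     for (int c = 0; c < cols; ++c)  // column by column, top to bottom
+       for (int r = 0; r < rows; ++r) scan[i++] = r * cols + c;
+   }
+*/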
+
DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = {
0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65,
80, 50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112,
@@ -68,6 +426,48 @@
255,
};
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x16[256]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+ 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+ 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+ 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+ 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+ 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+ 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+ 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+ 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+ 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+ 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+ 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+ 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+ 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x16[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+#endif // CONFIG_EXT_TX
+
DECLARE_ALIGNED(16, static const int16_t, col_scan_16x16[256]) = {
0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18,
81, 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51,
@@ -110,6 +510,167 @@
255,
};
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x32[1024]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416,
+ 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864,
+ 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289,
+ 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737,
+ 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162,
+ 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610,
+ 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35,
+ 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931,
+ 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356,
+ 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804,
+ 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229,
+ 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677,
+ 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102,
+ 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550,
+ 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423,
+ 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871,
+ 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296,
+ 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744,
+ 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169,
+ 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617,
+ 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42,
+ 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938,
+ 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363,
+ 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811,
+ 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236,
+ 268, 300, 332, 364, 396, 428, 460, 492, 524, 556, 588, 620, 652, 684,
+ 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109,
+ 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557,
+ 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430,
+ 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878,
+ 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303,
+ 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751,
+ 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176,
+ 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624,
+ 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49,
+ 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945,
+ 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370,
+ 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818,
+ 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243,
+ 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691,
+ 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 84, 116,
+ 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564,
+ 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437,
+ 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885,
+ 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310,
+ 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758,
+ 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183,
+ 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631,
+ 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56,
+ 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952,
+ 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377,
+ 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825,
+ 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250,
+ 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698,
+ 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123,
+ 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571,
+ 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444,
+ 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892,
+ 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317,
+ 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765,
+ 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190,
+ 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638,
+ 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63,
+ 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+ 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959,
+ 991, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x32[1024]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+ 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
+ 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+ 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
+ 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
+ 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
+ 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+ 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
+ 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
+ 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
+ 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
+ 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
+ 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285,
+ 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298,
+ 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311,
+ 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324,
+ 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337,
+ 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350,
+ 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
+ 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376,
+ 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402,
+ 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415,
+ 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428,
+ 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441,
+ 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454,
+ 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467,
+ 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480,
+ 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493,
+ 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506,
+ 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519,
+ 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532,
+ 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545,
+ 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558,
+ 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571,
+ 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584,
+ 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597,
+ 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610,
+ 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623,
+ 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636,
+ 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649,
+ 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662,
+ 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675,
+ 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688,
+ 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701,
+ 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714,
+ 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727,
+ 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740,
+ 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753,
+ 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766,
+ 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779,
+ 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792,
+ 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805,
+ 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818,
+ 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831,
+ 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844,
+ 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857,
+ 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870,
+ 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883,
+ 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896,
+ 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909,
+ 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922,
+ 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935,
+ 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948,
+ 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961,
+ 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974,
+ 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987,
+ 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000,
+ 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013,
+ 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
+};
+#endif // CONFIG_EXT_TX
+
DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = {
0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66,
160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5,
@@ -192,165 +753,1157 @@
1020, 989, 958, 927, 1021, 990, 959, 1022, 991, 1023,
};
-// Neighborhood 5-tuples for various scans and blocksizes,
-// in {top, left, topleft, topright, bottomleft} order
-// for each position in raster scan order.
-// -1 indicates the neighbor does not exist.
+#if CONFIG_EXT_TX
+// Scan over two rectangular vertical partitions (left half, then right half)
+// one after the other
+DECLARE_ALIGNED(16, static const int16_t, v2_scan_32x32[1024]) = {
+ 0, 1, 32, 33, 2, 64, 34, 65, 66, 3, 96, 35, 97,
+ 67, 98, 4, 128, 36, 129, 99, 68, 130, 5, 100, 131, 160,
+ 37, 161, 69, 162, 132, 101, 163, 6, 192, 38, 193, 70, 194,
+ 133, 164, 102, 195, 7, 224, 39, 165, 225, 134, 196, 71, 226,
+ 103, 227, 166, 197, 8, 256, 40, 135, 228, 257, 72, 258, 198,
+ 104, 259, 167, 229, 136, 260, 9, 288, 41, 289, 73, 199, 230,
+ 290, 168, 261, 105, 291, 137, 292, 231, 10, 200, 262, 320, 42,
+ 321, 74, 322, 169, 293, 106, 323, 232, 263, 138, 324, 201, 294,
+ 11, 352, 43, 353, 75, 170, 325, 354, 264, 107, 233, 295, 355,
+ 202, 326, 139, 356, 12, 384, 44, 265, 296, 385, 171, 357, 76,
+ 386, 234, 327, 108, 387, 203, 358, 140, 388, 297, 266, 328, 13,
+ 172, 389, 416, 45, 235, 359, 417, 77, 418, 109, 419, 204, 390,
+ 298, 329, 141, 267, 360, 420, 236, 391, 173, 421, 14, 448, 46,
+ 449, 78, 330, 450, 299, 361, 110, 205, 422, 451, 268, 392, 142,
+ 452, 237, 423, 174, 331, 362, 453, 15, 300, 393, 480, 47, 481,
+ 79, 482, 206, 454, 269, 424, 111, 483, 143, 484, 363, 332, 394,
+ 238, 455, 175, 301, 425, 485, 512, 513, 270, 456, 514, 207, 486,
+ 364, 395, 515, 333, 426, 516, 239, 487, 302, 457, 517, 396, 271,
+ 488, 544, 365, 427, 545, 518, 546, 334, 458, 547, 519, 548, 303,
+ 489, 397, 428, 549, 366, 459, 520, 576, 335, 490, 550, 577, 578,
+ 579, 521, 429, 551, 398, 460, 580, 367, 491, 581, 552, 522, 582,
+ 608, 609, 430, 461, 610, 399, 492, 553, 611, 583, 523, 612, 613,
+ 584, 554, 462, 431, 493, 614, 524, 640, 641, 642, 585, 643, 555,
+ 615, 644, 463, 494, 586, 525, 616, 645, 556, 646, 672, 617, 673,
+ 587, 674, 647, 495, 675, 526, 676, 557, 618, 648, 677, 588, 678,
+ 527, 649, 619, 704, 558, 705, 706, 679, 589, 707, 650, 708, 620,
+ 680, 709, 559, 590, 710, 651, 681, 736, 621, 737, 711, 738, 739,
+ 682, 652, 740, 712, 591, 741, 622, 683, 713, 742, 653, 768, 769,
+ 743, 770, 714, 684, 771, 623, 772, 744, 654, 773, 715, 685, 745,
+ 774, 655, 775, 800, 801, 716, 746, 802, 803, 686, 776, 804, 747,
+ 805, 717, 777, 806, 687, 748, 807, 778, 832, 833, 718, 834, 835,
+ 808, 836, 779, 749, 837, 809, 719, 838, 780, 750, 810, 839, 864,
+ 865, 866, 867, 840, 781, 868, 811, 751, 869, 841, 870, 812, 782,
+ 842, 871, 896, 897, 898, 872, 899, 813, 843, 900, 783, 901, 873,
+ 844, 902, 814, 874, 903, 928, 929, 845, 930, 904, 815, 875, 931,
+ 932, 905, 933, 846, 876, 934, 906, 935, 877, 960, 847, 961, 962,
+ 907, 936, 963, 964, 937, 878, 965, 908, 966, 938, 967, 909, 879,
+ 992, 939, 993, 968, 994, 995, 996, 910, 969, 940, 997, 998, 970,
+ 911, 941, 999, 971, 1000, 942, 1001, 972, 1002, 943, 973, 1003, 974,
+ 1004, 975, 1005, 1006, 1007, 16, 48, 80, 112, 144, 176, 17, 49,
+ 208, 81, 113, 145, 240, 177, 272, 18, 50, 209, 82, 114, 304,
+ 241, 146, 178, 273, 336, 210, 19, 51, 83, 115, 305, 242, 147,
+ 368, 179, 274, 337, 211, 20, 400, 52, 84, 306, 116, 243, 369,
+ 148, 338, 180, 275, 432, 401, 212, 21, 53, 307, 85, 370, 244,
+ 117, 464, 149, 433, 339, 276, 181, 402, 213, 308, 496, 371, 22,
+ 54, 465, 86, 245, 118, 434, 150, 340, 277, 403, 182, 528, 497,
+ 214, 466, 372, 309, 23, 55, 435, 87, 246, 119, 341, 404, 151,
+ 529, 560, 278, 498, 183, 467, 373, 215, 310, 436, 24, 56, 247,
+ 561, 88, 530, 592, 342, 120, 405, 499, 152, 279, 468, 184, 374,
+ 311, 437, 216, 562, 593, 531, 624, 25, 248, 500, 57, 406, 89,
+ 343, 121, 469, 280, 153, 594, 185, 375, 563, 625, 438, 532, 656,
+ 312, 217, 501, 407, 249, 26, 344, 58, 90, 470, 122, 595, 626,
+ 281, 564, 657, 154, 376, 533, 688, 439, 186, 313, 502, 218, 408,
+ 627, 596, 658, 250, 345, 471, 27, 59, 565, 689, 91, 123, 282,
+ 534, 720, 155, 440, 377, 187, 503, 314, 628, 659, 219, 597, 690,
+ 409, 472, 566, 721, 346, 251, 28, 60, 535, 752, 92, 124, 283,
+ 441, 378, 156, 660, 504, 629, 691, 598, 722, 188, 315, 567, 753,
+ 220, 410, 473, 347, 536, 784, 252, 29, 661, 692, 61, 93, 442,
+ 630, 723, 284, 125, 379, 505, 599, 754, 157, 316, 568, 785, 189,
+ 474, 411, 221, 537, 816, 693, 348, 662, 724, 253, 631, 755, 443,
+ 30, 600, 786, 62, 506, 94, 285, 380, 126, 569, 817, 158, 317,
+ 190, 475, 694, 725, 412, 663, 756, 538, 848, 222, 632, 787, 349,
+ 254, 601, 818, 444, 507, 31, 63, 381, 286, 95, 570, 849, 726,
+ 127, 695, 757, 664, 788, 159, 476, 318, 413, 539, 880, 191, 633,
+ 819, 223, 350, 602, 850, 508, 255, 445, 727, 758, 696, 789, 571,
+ 881, 382, 287, 665, 820, 477, 634, 851, 540, 912, 319, 414, 603,
+ 882, 759, 728, 790, 351, 509, 697, 821, 446, 572, 913, 666, 852,
+ 383, 635, 883, 478, 541, 944, 415, 760, 791, 604, 914, 729, 822,
+ 698, 853, 510, 667, 884, 447, 573, 945, 636, 915, 792, 761, 823,
+ 542, 976, 479, 730, 854, 605, 946, 699, 885, 668, 916, 511, 574,
+ 977, 793, 824, 637, 947, 762, 855, 731, 886, 543, 1008, 606, 978,
+ 700, 917, 669, 948, 575, 825, 1009, 794, 856, 763, 887, 638, 979,
+ 732, 918, 701, 949, 607, 1010, 670, 980, 826, 857, 795, 888, 764,
+ 919, 639, 1011, 733, 950, 702, 981, 858, 827, 889, 796, 920, 671,
+ 1012, 765, 951, 734, 982, 703, 1013, 859, 890, 828, 921, 797, 952,
+ 766, 983, 735, 1014, 891, 860, 922, 829, 953, 798, 984, 767, 1015,
+ 892, 923, 861, 954, 830, 985, 799, 1016, 924, 893, 955, 862, 986,
+ 831, 1017, 925, 956, 894, 987, 863, 1018, 957, 926, 988, 895, 1019,
+ 958, 989, 927, 1020, 990, 959, 1021, 991, 1022, 1023,
+};
+
+// Scan over two rectangular horizontal partitions (top half, then bottom
+// half) one after the other
+DECLARE_ALIGNED(16, static const int16_t, h2_scan_32x32[1024]) = {
+ 0, 1, 32, 33, 2, 64, 34, 65, 66, 3, 96, 35, 97,
+ 67, 98, 4, 128, 36, 129, 99, 68, 130, 5, 100, 131, 160,
+ 37, 161, 69, 162, 132, 101, 163, 6, 192, 38, 193, 70, 194,
+ 133, 164, 102, 195, 7, 224, 39, 165, 225, 134, 196, 71, 226,
+ 103, 227, 166, 197, 8, 256, 40, 135, 228, 257, 72, 258, 198,
+ 104, 259, 167, 229, 136, 260, 9, 288, 41, 289, 73, 199, 230,
+ 290, 168, 261, 105, 291, 137, 292, 231, 10, 200, 262, 320, 42,
+ 321, 74, 322, 169, 293, 106, 323, 232, 263, 138, 324, 201, 294,
+ 11, 352, 43, 353, 75, 170, 325, 354, 264, 107, 233, 295, 355,
+ 202, 326, 139, 356, 12, 384, 44, 265, 296, 385, 171, 357, 76,
+ 386, 234, 327, 108, 387, 203, 358, 140, 388, 297, 266, 328, 13,
+ 172, 389, 416, 45, 235, 359, 417, 77, 418, 109, 419, 204, 390,
+ 298, 329, 141, 267, 360, 420, 236, 391, 173, 421, 14, 448, 46,
+ 449, 78, 330, 450, 299, 361, 110, 205, 422, 451, 268, 392, 142,
+ 452, 237, 423, 174, 331, 362, 453, 15, 300, 393, 480, 47, 481,
+ 79, 482, 206, 454, 269, 424, 111, 483, 143, 484, 363, 332, 394,
+ 238, 455, 175, 301, 425, 485, 16, 48, 80, 270, 456, 207, 486,
+ 112, 364, 395, 333, 426, 144, 239, 487, 302, 457, 176, 396, 17,
+ 271, 488, 49, 365, 427, 208, 81, 334, 458, 113, 145, 240, 303,
+ 489, 397, 428, 177, 366, 459, 272, 18, 50, 209, 335, 490, 82,
+ 114, 304, 241, 429, 146, 398, 460, 367, 491, 178, 273, 336, 210,
+ 19, 51, 83, 430, 461, 399, 492, 115, 305, 242, 147, 368, 179,
+ 274, 337, 462, 431, 493, 211, 20, 400, 52, 84, 306, 116, 243,
+ 369, 148, 463, 494, 338, 180, 275, 432, 401, 212, 21, 53, 307,
+ 85, 370, 244, 117, 495, 464, 149, 433, 339, 276, 181, 402, 213,
+ 308, 496, 371, 22, 54, 465, 86, 245, 118, 434, 150, 340, 277,
+ 403, 182, 497, 214, 466, 372, 309, 23, 55, 435, 87, 246, 119,
+ 341, 404, 151, 278, 498, 183, 467, 373, 215, 310, 436, 24, 56,
+ 247, 88, 342, 120, 405, 499, 152, 279, 468, 184, 374, 311, 437,
+ 216, 25, 248, 500, 57, 406, 89, 343, 121, 469, 280, 153, 185,
+ 375, 438, 312, 217, 501, 407, 249, 26, 344, 58, 90, 470, 122,
+ 281, 154, 376, 439, 186, 313, 502, 218, 408, 250, 345, 471, 27,
+ 59, 91, 123, 282, 155, 440, 377, 187, 503, 314, 219, 409, 472,
+ 346, 251, 28, 60, 92, 124, 283, 441, 378, 156, 504, 188, 315,
+ 220, 410, 473, 347, 252, 29, 61, 93, 442, 284, 125, 379, 505,
+ 157, 316, 189, 474, 411, 221, 348, 253, 443, 30, 62, 506, 94,
+ 285, 380, 126, 158, 317, 190, 475, 412, 222, 349, 254, 444, 507,
+ 31, 63, 381, 286, 95, 127, 159, 476, 318, 413, 191, 223, 350,
+ 508, 255, 445, 382, 287, 477, 319, 414, 351, 509, 446, 383, 478,
+ 415, 510, 447, 479, 511, 512, 513, 514, 515, 516, 517, 544, 545,
+ 518, 546, 547, 519, 548, 549, 520, 576, 550, 577, 578, 579, 521,
+ 551, 580, 581, 552, 522, 582, 608, 609, 610, 553, 611, 583, 523,
+ 612, 613, 584, 554, 614, 524, 640, 641, 642, 585, 643, 555, 615,
+ 644, 586, 525, 616, 645, 556, 646, 672, 617, 673, 587, 674, 647,
+ 675, 526, 676, 557, 618, 648, 677, 588, 678, 527, 649, 619, 704,
+ 558, 705, 706, 679, 589, 707, 650, 708, 620, 680, 709, 528, 559,
+ 590, 710, 651, 681, 736, 621, 737, 711, 738, 739, 682, 652, 529,
+ 560, 740, 712, 591, 741, 622, 683, 713, 742, 653, 768, 769, 561,
+ 743, 530, 592, 770, 714, 684, 771, 623, 772, 744, 654, 773, 715,
+ 685, 745, 774, 562, 593, 531, 624, 655, 775, 800, 801, 716, 746,
+ 802, 803, 686, 776, 804, 594, 563, 625, 747, 805, 717, 532, 656,
+ 777, 806, 687, 748, 807, 778, 832, 833, 718, 834, 595, 626, 835,
+ 564, 657, 808, 836, 533, 688, 779, 749, 837, 809, 719, 838, 780,
+ 627, 596, 658, 750, 810, 839, 864, 565, 689, 865, 866, 867, 534,
+ 720, 840, 781, 868, 811, 751, 869, 841, 628, 659, 597, 690, 870,
+ 812, 782, 566, 721, 842, 871, 896, 535, 752, 897, 898, 872, 899,
+ 813, 843, 660, 900, 783, 629, 691, 598, 722, 901, 873, 567, 753,
+ 844, 902, 814, 874, 536, 784, 903, 661, 692, 928, 929, 630, 723,
+ 845, 930, 904, 815, 875, 931, 599, 754, 932, 568, 785, 905, 933,
+ 846, 876, 934, 537, 816, 693, 662, 724, 906, 631, 755, 935, 877,
+ 600, 786, 960, 847, 961, 962, 907, 936, 963, 569, 817, 964, 937,
+ 694, 725, 878, 965, 908, 663, 756, 538, 848, 966, 632, 787, 938,
+ 601, 818, 967, 909, 879, 992, 939, 993, 968, 570, 849, 994, 726,
+ 695, 757, 995, 664, 788, 996, 910, 969, 539, 880, 940, 633, 819,
+ 997, 998, 602, 850, 970, 911, 941, 999, 727, 758, 696, 789, 571,
+ 881, 971, 665, 820, 1000, 634, 851, 942, 540, 912, 1001, 972, 603,
+ 882, 759, 728, 790, 1002, 697, 821, 943, 973, 572, 913, 666, 852,
+ 1003, 635, 883, 974, 541, 944, 760, 791, 1004, 604, 914, 729, 822,
+ 698, 853, 975, 667, 884, 573, 945, 1005, 636, 915, 792, 761, 823,
+ 542, 976, 1006, 730, 854, 605, 946, 699, 885, 668, 916, 1007, 574,
+ 977, 793, 824, 637, 947, 762, 855, 731, 886, 543, 1008, 606, 978,
+ 700, 917, 669, 948, 575, 825, 1009, 794, 856, 763, 887, 638, 979,
+ 732, 918, 701, 949, 607, 1010, 670, 980, 826, 857, 795, 888, 764,
+ 919, 639, 1011, 733, 950, 702, 981, 858, 827, 889, 796, 920, 671,
+ 1012, 765, 951, 734, 982, 703, 1013, 859, 890, 828, 921, 797, 952,
+ 766, 983, 735, 1014, 891, 860, 922, 829, 953, 798, 984, 767, 1015,
+ 892, 923, 861, 954, 830, 985, 799, 1016, 924, 893, 955, 862, 986,
+ 831, 1017, 925, 956, 894, 987, 863, 1018, 957, 926, 988, 895, 1019,
+ 958, 989, 927, 1020, 990, 959, 1021, 991, 1022, 1023,
+};
+
+// Scan where the top-left quarter of the block is scanned first
+DECLARE_ALIGNED(16, static const int16_t, qtr_scan_32x32[1024]) = {
+ 0, 1, 32, 33, 2, 64, 34, 65, 66, 3, 96, 35, 97,
+ 67, 98, 4, 128, 36, 129, 99, 68, 130, 5, 100, 131, 160,
+ 37, 161, 69, 162, 132, 101, 163, 6, 192, 38, 193, 70, 194,
+ 133, 164, 102, 195, 7, 224, 39, 165, 225, 134, 196, 71, 226,
+ 103, 227, 166, 197, 8, 256, 40, 135, 228, 257, 72, 258, 198,
+ 104, 259, 167, 229, 136, 260, 9, 288, 41, 289, 73, 199, 230,
+ 290, 168, 261, 105, 291, 137, 292, 231, 10, 200, 262, 320, 42,
+ 321, 74, 322, 169, 293, 106, 323, 232, 263, 138, 324, 201, 294,
+ 11, 352, 43, 353, 75, 170, 325, 354, 264, 107, 233, 295, 355,
+ 202, 326, 139, 356, 12, 384, 44, 265, 296, 385, 171, 357, 76,
+ 386, 234, 327, 108, 387, 203, 358, 140, 388, 297, 266, 328, 13,
+ 172, 389, 416, 45, 235, 359, 417, 77, 418, 109, 419, 204, 390,
+ 298, 329, 141, 267, 360, 420, 236, 391, 173, 421, 14, 448, 46,
+ 449, 78, 330, 450, 299, 361, 110, 205, 422, 451, 268, 392, 142,
+ 452, 237, 423, 174, 331, 362, 453, 15, 300, 393, 480, 47, 481,
+ 79, 482, 206, 454, 269, 424, 111, 483, 143, 484, 363, 332, 394,
+ 238, 455, 175, 301, 425, 485, 270, 456, 207, 486, 364, 395, 333,
+ 426, 239, 487, 302, 457, 396, 271, 488, 365, 427, 334, 458, 303,
+ 489, 397, 428, 366, 459, 335, 490, 429, 398, 460, 367, 491, 430,
+ 461, 399, 492, 462, 431, 493, 463, 494, 495, 16, 512, 48, 513,
+ 80, 514, 112, 515, 144, 516, 176, 517, 17, 544, 49, 545, 208,
+ 518, 81, 546, 113, 547, 145, 240, 519, 548, 177, 549, 272, 520,
+ 18, 576, 50, 209, 550, 577, 82, 578, 114, 579, 304, 521, 241,
+ 551, 146, 580, 178, 581, 273, 552, 336, 522, 210, 582, 19, 608,
+ 51, 609, 83, 610, 115, 305, 553, 611, 242, 583, 147, 368, 523,
+ 612, 179, 613, 274, 584, 337, 554, 211, 614, 20, 400, 524, 640,
+ 52, 641, 84, 642, 306, 585, 116, 643, 243, 369, 555, 615, 148,
+ 644, 338, 586, 180, 275, 432, 525, 616, 645, 401, 556, 212, 646,
+ 21, 672, 53, 307, 617, 673, 85, 370, 587, 674, 244, 647, 117,
+ 675, 464, 526, 149, 676, 433, 557, 339, 618, 276, 648, 181, 677,
+ 402, 588, 213, 678, 308, 496, 527, 649, 371, 619, 22, 704, 54,
+ 465, 558, 705, 86, 706, 245, 679, 118, 434, 589, 707, 150, 340,
+ 650, 708, 277, 403, 620, 680, 182, 709, 528, 497, 559, 214, 466,
+ 590, 710, 372, 651, 309, 681, 23, 736, 55, 435, 621, 737, 87,
+ 246, 711, 738, 119, 739, 341, 682, 404, 652, 151, 529, 560, 740,
+ 278, 712, 498, 591, 183, 741, 467, 622, 373, 683, 215, 310, 713,
+ 742, 436, 653, 24, 768, 56, 769, 247, 561, 743, 88, 530, 592,
+ 770, 342, 714, 120, 405, 684, 771, 499, 623, 152, 772, 279, 744,
+ 468, 654, 184, 773, 374, 715, 311, 437, 685, 745, 216, 774, 562,
+ 593, 531, 624, 25, 248, 500, 655, 775, 800, 57, 801, 406, 716,
+ 89, 343, 746, 802, 121, 803, 469, 686, 280, 776, 153, 804, 594,
+ 185, 375, 563, 625, 747, 805, 438, 717, 532, 656, 312, 777, 217,
+ 806, 501, 687, 407, 748, 249, 807, 26, 344, 778, 832, 58, 833,
+ 90, 470, 718, 834, 122, 595, 626, 835, 281, 564, 657, 808, 154,
+ 836, 376, 533, 688, 779, 439, 749, 186, 837, 313, 809, 502, 719,
+ 218, 838, 408, 780, 627, 596, 658, 250, 345, 471, 750, 810, 839,
+ 27, 864, 59, 565, 689, 865, 91, 866, 123, 867, 282, 534, 720,
+ 840, 155, 440, 781, 868, 377, 811, 187, 503, 751, 869, 314, 841,
+ 628, 659, 219, 597, 690, 870, 409, 812, 472, 782, 566, 721, 346,
+ 842, 251, 871, 28, 896, 60, 535, 752, 897, 92, 898, 124, 283,
+ 872, 899, 441, 813, 378, 843, 156, 660, 900, 504, 783, 629, 691,
+ 598, 722, 188, 901, 315, 873, 567, 753, 220, 410, 844, 902, 473,
+ 814, 347, 874, 536, 784, 252, 903, 29, 661, 692, 928, 61, 929,
+ 93, 442, 630, 723, 845, 930, 284, 904, 125, 379, 505, 815, 875,
+ 931, 599, 754, 157, 932, 316, 568, 785, 905, 189, 933, 474, 846,
+ 411, 876, 221, 934, 537, 816, 693, 348, 662, 724, 906, 253, 631,
+ 755, 935, 443, 877, 30, 600, 786, 960, 62, 506, 847, 961, 94,
+ 962, 285, 380, 907, 936, 126, 963, 569, 817, 158, 964, 317, 937,
+ 190, 475, 694, 725, 878, 965, 412, 908, 663, 756, 538, 848, 222,
+ 966, 632, 787, 349, 938, 254, 601, 818, 967, 444, 909, 507, 879,
+ 31, 992, 63, 381, 939, 993, 286, 968, 95, 570, 849, 994, 726,
+ 127, 695, 757, 995, 664, 788, 159, 996, 476, 910, 318, 969, 413,
+ 539, 880, 940, 191, 633, 819, 997, 223, 998, 350, 602, 850, 970,
+ 508, 911, 255, 445, 941, 999, 727, 758, 696, 789, 571, 881, 382,
+ 971, 287, 665, 820, 1000, 477, 634, 851, 942, 540, 912, 319, 1001,
+ 414, 972, 603, 882, 759, 728, 790, 351, 1002, 509, 697, 821, 943,
+ 446, 973, 572, 913, 666, 852, 383, 1003, 635, 883, 478, 974, 541,
+ 944, 415, 760, 791, 1004, 604, 914, 729, 822, 698, 853, 510, 975,
+ 667, 884, 447, 573, 945, 1005, 636, 915, 792, 761, 823, 542, 976,
+ 479, 1006, 730, 854, 605, 946, 699, 885, 668, 916, 511, 1007, 574,
+ 977, 793, 824, 637, 947, 762, 855, 731, 886, 543, 1008, 606, 978,
+ 700, 917, 669, 948, 575, 825, 1009, 794, 856, 763, 887, 638, 979,
+ 732, 918, 701, 949, 607, 1010, 670, 980, 826, 857, 795, 888, 764,
+ 919, 639, 1011, 733, 950, 702, 981, 858, 827, 889, 796, 920, 671,
+ 1012, 765, 951, 734, 982, 703, 1013, 859, 890, 828, 921, 797, 952,
+ 766, 983, 735, 1014, 891, 860, 922, 829, 953, 798, 984, 767, 1015,
+ 892, 923, 861, 954, 830, 985, 799, 1016, 924, 893, 955, 862, 986,
+ 831, 1017, 925, 956, 894, 987, 863, 1018, 957, 926, 988, 895, 1019,
+ 958, 989, 927, 1020, 990, 959, 1021, 991, 1022, 1023,
+};
+#endif // CONFIG_EXT_TX
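+
+/* Whatever the partitioning, every scan table must visit each coefficient of
+   its block exactly once.  A small self-check along these lines (a sketch,
+   not part of the library) can catch typos when hand-written tables such as
+   v2_/h2_/qtr_scan_32x32 above are edited:
+
+   static int scan_is_permutation(const int16_t *scan, int n) {
+     unsigned char seen[32 * 32] = { 0 };  // large enough for the 32x32 scans
+     for (int i = 0; i < n; ++i) {
+       if (scan[i] < 0 || scan[i] >= n || seen[scan[i]]) return 0;
+       seen[scan[i]] = 1;  // each position index must appear exactly once
+     }
+     return 1;  // e.g. scan_is_permutation(qtr_scan_32x32, 1024) should hold
+   }
+*/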
+
+// Neighborhood 2-tuples for various scans and block sizes,
+// in {top, left} order for each position in the corresponding scan order.
DECLARE_ALIGNED(16, static const int16_t,
default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 1, 4, 4, 4, 1, 1, 8, 8, 5, 8, 2,
+ 0, 0, 0, 0, 4, 0, 1, 4, 4, 5, 5, 1, 8, 8, 5, 8, 2,
2, 2, 5, 9, 12, 6, 9, 3, 6, 10, 13, 7, 10, 11, 14, 0, 0,
};
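+
+/* The {top, left} pairs are consumed when deriving the entropy-coding context
+   of a coefficient: the cached token magnitudes at the two neighbor positions
+   are averaged, rounding up, in the spirit of the get_coef_context() helper
+   used by the coefficient coder.  Sketch with illustrative names only:
+
+   static int coef_context(const int16_t *neighbors,
+                           const uint8_t *token_cache, int c) {
+     return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
+             token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
+   }
+*/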
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 4, 4, 8, 8, 0, 0, 1, 4, 5, 8, 9, 12, 1,
+ 1, 2, 5, 6, 9, 10, 13, 2, 2, 3, 6, 7, 10, 11, 14, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4,
+ 4, 5, 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 0, 0,
+};
+#endif // CONFIG_EXT_TX
+
DECLARE_ALIGNED(16, static const int16_t,
col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 4, 4, 0, 0, 8, 8, 1, 1, 5, 5, 1, 1, 9,
- 9, 2, 2, 6, 6, 2, 2, 3, 3, 10, 10, 7, 7, 11, 11, 0, 0,
+ 0, 0, 0, 0, 4, 4, 4, 0, 8, 8, 1, 4, 5, 8, 5, 1, 9,
+ 12, 2, 5, 6, 9, 6, 2, 3, 6, 10, 13, 7, 10, 11, 14, 0, 0,
};
DECLARE_ALIGNED(16, static const int16_t,
row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 1, 1, 4, 4, 2, 2, 5, 5, 4, 4, 8,
- 8, 6, 6, 8, 8, 9, 9, 12, 12, 10, 10, 13, 13, 14, 14, 0, 0,
+ 0, 0, 0, 0, 0, 1, 1, 1, 1, 4, 2, 2, 2, 5, 4, 5, 5,
+ 8, 3, 6, 8, 9, 6, 9, 9, 12, 7, 10, 10, 13, 11, 14, 0, 0,
};
DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 4, 1, 1, 4, 4, 2, 5, 5, 8, 6,
+ 9, 2, 2, 8, 8, 3, 6, 9, 12, 7, 10, 10, 13, 12, 12, 13, 16,
+ 11, 14, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21,
+ 24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0, 0
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 4, 4, 8, 8, 12, 12, 16, 16, 20, 20, 24, 24, 0,
+ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 1, 1,
+ 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 2, 2, 3,
+ 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4,
+ 4, 5, 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 12, 12,
+ 13, 16, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21,
+ 24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0, 0
+};
+#endif  // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 8, 1, 1, 8, 8, 2, 9, 9, 16, 10,
+ 17, 2, 2, 16, 16, 3, 10, 17, 24, 11, 18, 18, 25, 3, 3, 4, 11,
+ 19, 26, 12, 19, 4, 4, 20, 27, 5, 12, 13, 20, 21, 28, 5, 5, 6,
+ 13, 14, 21, 22, 29, 6, 6, 7, 14, 15, 22, 23, 30, 0, 0
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 8, 8, 16, 16, 0, 0, 1, 8, 9, 16, 17, 24, 1,
+ 1, 2, 9, 10, 17, 18, 25, 2, 2, 3, 10, 11, 18, 19, 26, 3, 3,
+ 4, 11, 12, 19, 20, 27, 4, 4, 5, 12, 13, 20, 21, 28, 5, 5, 6,
+ 13, 14, 21, 22, 29, 6, 6, 7, 14, 15, 22, 23, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 0,
+ 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, 7, 14, 8, 8,
+ 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 16, 17,
+ 24, 18, 25, 19, 26, 20, 27, 21, 28, 22, 29, 23, 30, 0, 0
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t,
col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 8, 8, 0, 0, 16, 16, 1, 1, 24, 24, 9, 9, 1, 1, 32,
- 32, 17, 17, 2, 2, 25, 25, 10, 10, 40, 40, 2, 2, 18, 18, 33, 33, 3, 3,
- 48, 48, 11, 11, 26, 26, 3, 3, 41, 41, 19, 19, 34, 34, 4, 4, 27, 27, 12,
- 12, 49, 49, 42, 42, 20, 20, 4, 4, 35, 35, 5, 5, 28, 28, 50, 50, 43, 43,
- 13, 13, 36, 36, 5, 5, 21, 21, 51, 51, 29, 29, 6, 6, 44, 44, 14, 14, 6,
- 6, 37, 37, 52, 52, 22, 22, 7, 7, 30, 30, 45, 45, 15, 15, 38, 38, 23, 23,
- 53, 53, 31, 31, 46, 46, 39, 39, 54, 54, 47, 47, 55, 55, 0, 0,
+ 0, 0, 0, 0, 8, 8, 8, 0, 16, 16, 1, 8, 24, 24, 9, 16, 9, 1, 32,
+ 32, 17, 24, 2, 9, 25, 32, 10, 17, 40, 40, 10, 2, 18, 25, 33, 40, 3, 10,
+ 48, 48, 11, 18, 26, 33, 11, 3, 41, 48, 19, 26, 34, 41, 4, 11, 27, 34, 12,
+ 19, 49, 56, 42, 49, 20, 27, 12, 4, 35, 42, 5, 12, 28, 35, 50, 57, 43, 50,
+ 13, 20, 36, 43, 13, 5, 21, 28, 51, 58, 29, 36, 6, 13, 44, 51, 14, 21, 14,
+ 6, 37, 44, 52, 59, 22, 29, 7, 14, 30, 37, 45, 52, 15, 22, 38, 45, 23, 30,
+ 53, 60, 31, 38, 46, 53, 39, 46, 54, 61, 47, 54, 55, 62, 0, 0,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48, 0, 0, 1,
+ 8, 9, 16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56, 1, 1, 2, 9, 10, 17,
+ 18, 25, 26, 33, 34, 41, 42, 49, 50, 57, 2, 2, 3, 10, 11, 18, 19, 26, 27,
+ 34, 35, 42, 43, 50, 51, 58, 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43,
+ 44, 51, 52, 59, 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53,
+ 60, 5, 5, 6, 13, 14, 21, 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, 6, 6,
+ 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 0, 0,
};
DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 0, 0, 1,
+ 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, 7, 14, 8, 8, 9, 16, 10, 17,
+ 11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20,
+ 27, 21, 28, 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29, 36,
+ 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43, 37, 44, 38, 45, 39,
+ 46, 40, 40, 41, 48, 42, 49, 43, 50, 44, 51, 45, 52, 46, 53, 47, 54, 48, 48,
+ 49, 56, 50, 57, 51, 58, 52, 59, 53, 60, 54, 61, 55, 62, 0, 0,
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t,
row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 1, 1, 0, 0, 8, 8, 2, 2, 8, 8, 9, 9, 3, 3, 16,
- 16, 10, 10, 16, 16, 4, 4, 17, 17, 24, 24, 11, 11, 18, 18, 25, 25, 24, 24,
- 5, 5, 12, 12, 19, 19, 32, 32, 26, 26, 6, 6, 33, 33, 32, 32, 20, 20, 27,
- 27, 40, 40, 13, 13, 34, 34, 40, 40, 41, 41, 28, 28, 35, 35, 48, 48, 21, 21,
- 42, 42, 14, 14, 48, 48, 36, 36, 49, 49, 43, 43, 29, 29, 56, 56, 22, 22, 50,
- 50, 57, 57, 44, 44, 37, 37, 51, 51, 30, 30, 58, 58, 52, 52, 45, 45, 59, 59,
- 38, 38, 60, 60, 46, 46, 53, 53, 54, 54, 61, 61, 62, 62, 0, 0,
+ 0, 0, 0, 0, 1, 1, 0, 1, 1, 8, 2, 2, 8, 9, 2, 9, 3, 3, 9,
+ 16, 3, 10, 16, 17, 4, 4, 10, 17, 17, 24, 4, 11, 11, 18, 18, 25, 24, 25,
+ 5, 5, 5, 12, 12, 19, 25, 32, 19, 26, 6, 6, 26, 33, 32, 33, 13, 20, 20,
+ 27, 33, 40, 6, 13, 27, 34, 40, 41, 34, 41, 21, 28, 28, 35, 41, 48, 14, 21,
+ 35, 42, 7, 14, 48, 49, 29, 36, 42, 49, 36, 43, 22, 29, 49, 56, 15, 22, 43,
+ 50, 50, 57, 37, 44, 30, 37, 44, 51, 23, 30, 51, 58, 45, 52, 38, 45, 52, 59,
+ 31, 38, 53, 60, 39, 46, 46, 53, 47, 54, 54, 61, 55, 62, 0, 0,
};
DECLARE_ALIGNED(16, static const int16_t,
default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 8, 8, 1, 8, 1, 1, 9, 16, 16, 16, 2, 9, 2,
- 2, 10, 17, 17, 24, 24, 24, 3, 10, 3, 3, 18, 25, 25, 32, 11, 18, 32, 32,
- 4, 11, 26, 33, 19, 26, 4, 4, 33, 40, 12, 19, 40, 40, 5, 12, 27, 34, 34,
- 41, 20, 27, 13, 20, 5, 5, 41, 48, 48, 48, 28, 35, 35, 42, 21, 28, 6, 6,
+ 0, 0, 0, 0, 8, 0, 8, 8, 1, 8, 9, 1, 9, 16, 16, 17, 2, 9, 10,
+ 2, 10, 17, 17, 24, 24, 25, 3, 10, 11, 3, 18, 25, 25, 32, 11, 18, 32, 33,
+ 4, 11, 26, 33, 19, 26, 12, 4, 33, 40, 12, 19, 40, 41, 5, 12, 27, 34, 34,
+ 41, 20, 27, 13, 20, 13, 5, 41, 48, 48, 49, 28, 35, 35, 42, 21, 28, 6, 6,
6, 13, 42, 49, 49, 56, 36, 43, 14, 21, 29, 36, 7, 14, 43, 50, 50, 57, 22,
29, 37, 44, 15, 22, 44, 51, 51, 58, 30, 37, 23, 30, 52, 59, 45, 52, 38, 45,
31, 38, 53, 60, 46, 53, 39, 46, 54, 61, 47, 54, 55, 62, 0, 0,
};
DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 8, 8, 8, 2, 2, 2,
+ 9, 9, 16, 16, 16, 3, 3, 3, 10, 10, 17, 17, 24, 24, 24,
+ 4, 4, 4, 11, 11, 18, 18, 25, 25, 32, 32, 32, 5, 5, 5,
+ 12, 12, 19, 19, 26, 26, 33, 33, 40, 40, 40, 6, 6, 6, 13,
+ 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 7, 14, 14,
+ 21, 21, 28, 28, 35, 35, 42, 42, 49, 49, 56, 56, 56, 15, 22,
+ 22, 29, 29, 36, 36, 43, 43, 50, 50, 57, 57, 64, 64, 64, 23,
+ 30, 30, 37, 37, 44, 44, 51, 51, 58, 58, 65, 65, 72, 72, 72,
+ 31, 38, 38, 45, 45, 52, 52, 59, 59, 66, 66, 73, 73, 80, 80,
+ 80, 39, 46, 46, 53, 53, 60, 60, 67, 67, 74, 74, 81, 81, 88,
+ 88, 88, 47, 54, 54, 61, 61, 68, 68, 75, 75, 82, 82, 89, 89,
+ 96, 96, 96, 55, 62, 62, 69, 69, 76, 76, 83, 83, 90, 90, 97,
+ 97, 104, 104, 104, 63, 70, 70, 77, 77, 84, 84, 91, 91, 98, 98,
+ 105, 105, 112, 112, 112, 71, 78, 78, 85, 85, 92, 92, 99, 99, 106,
+ 106, 113, 113, 120, 79, 86, 86, 93, 93, 100, 100, 107, 107, 114, 114,
+ 121, 87, 94, 94, 101, 101, 108, 108, 115, 115, 122, 95, 102, 102, 109,
+ 109, 116, 116, 123, 103, 110, 110, 117, 117, 124, 111, 118, 118, 125, 119,
+ 126, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 16, 16, 16, 2, 2, 2,
+ 17, 17, 32, 32, 32, 3, 3, 3, 18, 18, 33, 33, 48, 48, 48,
+ 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 5, 5, 5,
+ 20, 20, 35, 35, 50, 50, 65, 65, 80, 80, 80, 6, 6, 6, 21,
+ 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 7, 7, 7,
+ 22, 22, 37, 37, 52, 52, 67, 67, 82, 82, 97, 97, 112, 8, 8,
+ 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98, 113, 9,
+ 9, 9, 24, 24, 39, 39, 54, 54, 69, 69, 84, 84, 99, 99, 114,
+ 10, 10, 10, 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100,
+ 115, 11, 11, 11, 26, 26, 41, 41, 56, 56, 71, 71, 86, 86, 101,
+ 101, 116, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87,
+ 102, 102, 117, 13, 13, 13, 28, 28, 43, 43, 58, 58, 73, 73, 88,
+ 88, 103, 103, 118, 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74,
+ 89, 89, 104, 104, 119, 15, 30, 30, 45, 45, 60, 60, 75, 75, 90,
+ 90, 105, 105, 120, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106,
+ 121, 47, 62, 62, 77, 77, 92, 92, 107, 107, 122, 63, 78, 78, 93,
+ 93, 108, 108, 123, 79, 94, 94, 109, 109, 124, 95, 110, 110, 125, 111,
+ 126, 0, 0
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48,
+ 56, 56, 64, 64, 72, 72, 80, 80, 88, 88, 96, 96, 104, 104, 112, 112,
+ 0, 0, 1, 8, 9, 16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56,
+ 57, 64, 65, 72, 73, 80, 81, 88, 89, 96, 97, 104, 105, 112, 113, 120,
+ 1, 1, 2, 9, 10, 17, 18, 25, 26, 33, 34, 41, 42, 49, 50, 57,
+ 58, 65, 66, 73, 74, 81, 82, 89, 90, 97, 98, 105, 106, 113, 114, 121,
+ 2, 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58,
+ 59, 66, 67, 74, 75, 82, 83, 90, 91, 98, 99, 106, 107, 114, 115, 122,
+ 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51, 52, 59,
+ 60, 67, 68, 75, 76, 83, 84, 91, 92, 99, 100, 107, 108, 115, 116, 123,
+ 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53, 60,
+ 61, 68, 69, 76, 77, 84, 85, 92, 93, 100, 101, 108, 109, 116, 117, 124,
+ 5, 5, 6, 13, 14, 21, 22, 29, 30, 37, 38, 45, 46, 53, 54, 61,
+ 62, 69, 70, 77, 78, 85, 86, 93, 94, 101, 102, 109, 110, 117, 118, 125,
+ 6, 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62,
+ 63, 70, 71, 78, 79, 86, 87, 94, 95, 102, 103, 110, 111, 118, 119, 126,
+ 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96, 96,
+ 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 96, 97, 112,
+ 1, 1, 2, 17, 18, 33, 34, 49, 50, 65, 66, 81, 82, 97, 98, 113,
+ 2, 2, 3, 18, 19, 34, 35, 50, 51, 66, 67, 82, 83, 98, 99, 114,
+ 3, 3, 4, 19, 20, 35, 36, 51, 52, 67, 68, 83, 84, 99, 100, 115,
+ 4, 4, 5, 20, 21, 36, 37, 52, 53, 68, 69, 84, 85, 100, 101, 116,
+ 5, 5, 6, 21, 22, 37, 38, 53, 54, 69, 70, 85, 86, 101, 102, 117,
+ 6, 6, 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118,
+ 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, 103, 104, 119,
+ 8, 8, 9, 24, 25, 40, 41, 56, 57, 72, 73, 88, 89, 104, 105, 120,
+ 9, 9, 10, 25, 26, 41, 42, 57, 58, 73, 74, 89, 90, 105, 106, 121,
+ 10, 10, 11, 26, 27, 42, 43, 58, 59, 74, 75, 90, 91, 106, 107, 122,
+ 11, 11, 12, 27, 28, 43, 44, 59, 60, 75, 76, 91, 92, 107, 108, 123,
+ 12, 12, 13, 28, 29, 44, 45, 60, 61, 76, 77, 92, 93, 108, 109, 124,
+ 13, 13, 14, 29, 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125,
+ 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111, 126,
+ 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 0, 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13,
+ 7, 14, 8, 8, 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14,
+ 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20, 27, 21, 28,
+ 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29,
+ 36, 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43,
+ 37, 44, 38, 45, 39, 46, 40, 40, 41, 48, 42, 49, 43, 50, 44,
+ 51, 45, 52, 46, 53, 47, 54, 48, 48, 49, 56, 50, 57, 51, 58,
+ 52, 59, 53, 60, 54, 61, 55, 62, 56, 56, 57, 64, 58, 65, 59,
+ 66, 60, 67, 61, 68, 62, 69, 63, 70, 64, 64, 65, 72, 66, 73,
+ 67, 74, 68, 75, 69, 76, 70, 77, 71, 78, 72, 72, 73, 80, 74,
+ 81, 75, 82, 76, 83, 77, 84, 78, 85, 79, 86, 80, 80, 81, 88,
+ 82, 89, 83, 90, 84, 91, 85, 92, 86, 93, 87, 94, 88, 88, 89,
+ 96, 90, 97, 91, 98, 92, 99, 93, 100, 94, 101, 95, 102, 96, 96,
+ 97, 104, 98, 105, 99, 106, 100, 107, 101, 108, 102, 109, 103, 110, 104,
+ 104, 105, 112, 106, 113, 107, 114, 108, 115, 109, 116, 110, 117, 111, 118,
+ 112, 112, 113, 120, 114, 121, 115, 122, 116, 123, 117, 124, 118, 125, 119,
+ 126, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6,
+ 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28,
+ 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21,
+ 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43,
+ 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36,
+ 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58,
+ 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51,
+ 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73,
+ 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66,
+ 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88,
+ 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81,
+ 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103,
+ 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96,
+ 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118,
+ 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
+ 126, 0, 0
+};
+#endif
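+
+/* Layout note (illustrative): each *_neighbors table pairs every position c
+ * of the matching scan order with the raster indices of its MAX_NEIGHBORS
+ * previously coded neighbors (two per position, judging by the declared
+ * (num_coeffs + 1) * MAX_NEIGHBORS sizes), so entry c occupies elements
+ * [2 * c] and [2 * c + 1]. A repeated pair such as "0, 0" marks a position
+ * with no distinct second neighbor, and the trailing "0, 0" is the padding
+ * pair for c == num_coeffs. */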
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 16, 16, 16, 2, 2, 2,
+ 17, 17, 32, 32, 32, 3, 3, 3, 18, 18, 33, 33, 48, 48, 48,
+ 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 5, 5, 5,
+ 20, 20, 35, 35, 50, 50, 65, 65, 80, 80, 80, 6, 6, 6, 21,
+ 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 7, 7, 7,
+ 22, 22, 37, 37, 52, 52, 67, 67, 82, 82, 97, 97, 112, 112, 112,
+ 8, 8, 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98,
+ 113, 113, 128, 128, 128, 9, 9, 9, 24, 24, 39, 39, 54, 54, 69,
+ 69, 84, 84, 99, 99, 114, 114, 129, 129, 144, 144, 144, 10, 10, 10,
+ 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100, 115, 115, 130,
+ 130, 145, 145, 160, 160, 160, 11, 11, 11, 26, 26, 41, 41, 56, 56,
+ 71, 71, 86, 86, 101, 101, 116, 116, 131, 131, 146, 146, 161, 161, 176,
+ 176, 176, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87,
+ 102, 102, 117, 117, 132, 132, 147, 147, 162, 162, 177, 177, 192, 192, 192,
+ 13, 13, 13, 28, 28, 43, 43, 58, 58, 73, 73, 88, 88, 103, 103,
+ 118, 118, 133, 133, 148, 148, 163, 163, 178, 178, 193, 193, 208, 208, 208,
+ 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74, 89, 89, 104, 104,
+ 119, 119, 134, 134, 149, 149, 164, 164, 179, 179, 194, 194, 209, 209, 224,
+ 224, 224, 15, 30, 30, 45, 45, 60, 60, 75, 75, 90, 90, 105, 105,
+ 120, 120, 135, 135, 150, 150, 165, 165, 180, 180, 195, 195, 210, 210, 225,
+ 225, 240, 240, 240, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106,
+ 121, 121, 136, 136, 151, 151, 166, 166, 181, 181, 196, 196, 211, 211, 226,
+ 226, 241, 241, 256, 256, 256, 47, 62, 62, 77, 77, 92, 92, 107, 107,
+ 122, 122, 137, 137, 152, 152, 167, 167, 182, 182, 197, 197, 212, 212, 227,
+ 227, 242, 242, 257, 257, 272, 272, 272, 63, 78, 78, 93, 93, 108, 108,
+ 123, 123, 138, 138, 153, 153, 168, 168, 183, 183, 198, 198, 213, 213, 228,
+ 228, 243, 243, 258, 258, 273, 273, 288, 288, 288, 79, 94, 94, 109, 109,
+ 124, 124, 139, 139, 154, 154, 169, 169, 184, 184, 199, 199, 214, 214, 229,
+ 229, 244, 244, 259, 259, 274, 274, 289, 289, 304, 304, 304, 95, 110, 110,
+ 125, 125, 140, 140, 155, 155, 170, 170, 185, 185, 200, 200, 215, 215, 230,
+ 230, 245, 245, 260, 260, 275, 275, 290, 290, 305, 305, 320, 320, 320, 111,
+ 126, 126, 141, 141, 156, 156, 171, 171, 186, 186, 201, 201, 216, 216, 231,
+ 231, 246, 246, 261, 261, 276, 276, 291, 291, 306, 306, 321, 321, 336, 336,
+ 336, 127, 142, 142, 157, 157, 172, 172, 187, 187, 202, 202, 217, 217, 232,
+ 232, 247, 247, 262, 262, 277, 277, 292, 292, 307, 307, 322, 322, 337, 337,
+ 352, 352, 352, 143, 158, 158, 173, 173, 188, 188, 203, 203, 218, 218, 233,
+ 233, 248, 248, 263, 263, 278, 278, 293, 293, 308, 308, 323, 323, 338, 338,
+ 353, 353, 368, 368, 368, 159, 174, 174, 189, 189, 204, 204, 219, 219, 234,
+ 234, 249, 249, 264, 264, 279, 279, 294, 294, 309, 309, 324, 324, 339, 339,
+ 354, 354, 369, 369, 384, 384, 384, 175, 190, 190, 205, 205, 220, 220, 235,
+ 235, 250, 250, 265, 265, 280, 280, 295, 295, 310, 310, 325, 325, 340, 340,
+ 355, 355, 370, 370, 385, 385, 400, 400, 400, 191, 206, 206, 221, 221, 236,
+ 236, 251, 251, 266, 266, 281, 281, 296, 296, 311, 311, 326, 326, 341, 341,
+ 356, 356, 371, 371, 386, 386, 401, 401, 416, 416, 416, 207, 222, 222, 237,
+ 237, 252, 252, 267, 267, 282, 282, 297, 297, 312, 312, 327, 327, 342, 342,
+ 357, 357, 372, 372, 387, 387, 402, 402, 417, 417, 432, 432, 432, 223, 238,
+ 238, 253, 253, 268, 268, 283, 283, 298, 298, 313, 313, 328, 328, 343, 343,
+ 358, 358, 373, 373, 388, 388, 403, 403, 418, 418, 433, 433, 448, 448, 448,
+ 239, 254, 254, 269, 269, 284, 284, 299, 299, 314, 314, 329, 329, 344, 344,
+ 359, 359, 374, 374, 389, 389, 404, 404, 419, 419, 434, 434, 449, 449, 464,
+ 464, 464, 255, 270, 270, 285, 285, 300, 300, 315, 315, 330, 330, 345, 345,
+ 360, 360, 375, 375, 390, 390, 405, 405, 420, 420, 435, 435, 450, 450, 465,
+ 465, 480, 480, 480, 271, 286, 286, 301, 301, 316, 316, 331, 331, 346, 346,
+ 361, 361, 376, 376, 391, 391, 406, 406, 421, 421, 436, 436, 451, 451, 466,
+ 466, 481, 481, 496, 287, 302, 302, 317, 317, 332, 332, 347, 347, 362, 362,
+ 377, 377, 392, 392, 407, 407, 422, 422, 437, 437, 452, 452, 467, 467, 482,
+ 482, 497, 303, 318, 318, 333, 333, 348, 348, 363, 363, 378, 378, 393, 393,
+ 408, 408, 423, 423, 438, 438, 453, 453, 468, 468, 483, 483, 498, 319, 334,
+ 334, 349, 349, 364, 364, 379, 379, 394, 394, 409, 409, 424, 424, 439, 439,
+ 454, 454, 469, 469, 484, 484, 499, 335, 350, 350, 365, 365, 380, 380, 395,
+ 395, 410, 410, 425, 425, 440, 440, 455, 455, 470, 470, 485, 485, 500, 351,
+ 366, 366, 381, 381, 396, 396, 411, 411, 426, 426, 441, 441, 456, 456, 471,
+ 471, 486, 486, 501, 367, 382, 382, 397, 397, 412, 412, 427, 427, 442, 442,
+ 457, 457, 472, 472, 487, 487, 502, 383, 398, 398, 413, 413, 428, 428, 443,
+ 443, 458, 458, 473, 473, 488, 488, 503, 399, 414, 414, 429, 429, 444, 444,
+ 459, 459, 474, 474, 489, 489, 504, 415, 430, 430, 445, 445, 460, 460, 475,
+ 475, 490, 490, 505, 431, 446, 446, 461, 461, 476, 476, 491, 491, 506, 447,
+ 462, 462, 477, 477, 492, 492, 507, 463, 478, 478, 493, 493, 508, 479, 494,
+ 494, 509, 495, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 32, 32, 32, 2, 2, 2,
+ 33, 33, 64, 64, 64, 3, 3, 3, 34, 34, 65, 65, 96, 96, 96,
+ 4, 4, 4, 35, 35, 66, 66, 97, 97, 128, 128, 128, 5, 5, 5,
+ 36, 36, 67, 67, 98, 98, 129, 129, 160, 160, 160, 6, 6, 6, 37,
+ 37, 68, 68, 99, 99, 130, 130, 161, 161, 192, 192, 192, 7, 7, 7,
+ 38, 38, 69, 69, 100, 100, 131, 131, 162, 162, 193, 193, 224, 224, 224,
+ 8, 8, 8, 39, 39, 70, 70, 101, 101, 132, 132, 163, 163, 194, 194,
+ 225, 225, 256, 256, 256, 9, 9, 9, 40, 40, 71, 71, 102, 102, 133,
+ 133, 164, 164, 195, 195, 226, 226, 257, 257, 288, 288, 288, 10, 10, 10,
+ 41, 41, 72, 72, 103, 103, 134, 134, 165, 165, 196, 196, 227, 227, 258,
+ 258, 289, 289, 320, 320, 320, 11, 11, 11, 42, 42, 73, 73, 104, 104,
+ 135, 135, 166, 166, 197, 197, 228, 228, 259, 259, 290, 290, 321, 321, 352,
+ 352, 352, 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167,
+ 198, 198, 229, 229, 260, 260, 291, 291, 322, 322, 353, 353, 384, 384, 384,
+ 13, 13, 13, 44, 44, 75, 75, 106, 106, 137, 137, 168, 168, 199, 199,
+ 230, 230, 261, 261, 292, 292, 323, 323, 354, 354, 385, 385, 416, 416, 416,
+ 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138, 169, 169, 200, 200,
+ 231, 231, 262, 262, 293, 293, 324, 324, 355, 355, 386, 386, 417, 417, 448,
+ 448, 448, 15, 15, 15, 46, 46, 77, 77, 108, 108, 139, 139, 170, 170,
+ 201, 201, 232, 232, 263, 263, 294, 294, 325, 325, 356, 356, 387, 387, 418,
+ 418, 449, 449, 480, 16, 16, 16, 47, 47, 78, 78, 109, 109, 140, 140,
+ 171, 171, 202, 202, 233, 233, 264, 264, 295, 295, 326, 326, 357, 357, 388,
+ 388, 419, 419, 450, 450, 481, 17, 17, 17, 48, 48, 79, 79, 110, 110,
+ 141, 141, 172, 172, 203, 203, 234, 234, 265, 265, 296, 296, 327, 327, 358,
+ 358, 389, 389, 420, 420, 451, 451, 482, 18, 18, 18, 49, 49, 80, 80,
+ 111, 111, 142, 142, 173, 173, 204, 204, 235, 235, 266, 266, 297, 297, 328,
+ 328, 359, 359, 390, 390, 421, 421, 452, 452, 483, 19, 19, 19, 50, 50,
+ 81, 81, 112, 112, 143, 143, 174, 174, 205, 205, 236, 236, 267, 267, 298,
+ 298, 329, 329, 360, 360, 391, 391, 422, 422, 453, 453, 484, 20, 20, 20,
+ 51, 51, 82, 82, 113, 113, 144, 144, 175, 175, 206, 206, 237, 237, 268,
+ 268, 299, 299, 330, 330, 361, 361, 392, 392, 423, 423, 454, 454, 485, 21,
+ 21, 21, 52, 52, 83, 83, 114, 114, 145, 145, 176, 176, 207, 207, 238,
+ 238, 269, 269, 300, 300, 331, 331, 362, 362, 393, 393, 424, 424, 455, 455,
+ 486, 22, 22, 22, 53, 53, 84, 84, 115, 115, 146, 146, 177, 177, 208,
+ 208, 239, 239, 270, 270, 301, 301, 332, 332, 363, 363, 394, 394, 425, 425,
+ 456, 456, 487, 23, 23, 23, 54, 54, 85, 85, 116, 116, 147, 147, 178,
+ 178, 209, 209, 240, 240, 271, 271, 302, 302, 333, 333, 364, 364, 395, 395,
+ 426, 426, 457, 457, 488, 24, 24, 24, 55, 55, 86, 86, 117, 117, 148,
+ 148, 179, 179, 210, 210, 241, 241, 272, 272, 303, 303, 334, 334, 365, 365,
+ 396, 396, 427, 427, 458, 458, 489, 25, 25, 25, 56, 56, 87, 87, 118,
+ 118, 149, 149, 180, 180, 211, 211, 242, 242, 273, 273, 304, 304, 335, 335,
+ 366, 366, 397, 397, 428, 428, 459, 459, 490, 26, 26, 26, 57, 57, 88,
+ 88, 119, 119, 150, 150, 181, 181, 212, 212, 243, 243, 274, 274, 305, 305,
+ 336, 336, 367, 367, 398, 398, 429, 429, 460, 460, 491, 27, 27, 27, 58,
+ 58, 89, 89, 120, 120, 151, 151, 182, 182, 213, 213, 244, 244, 275, 275,
+ 306, 306, 337, 337, 368, 368, 399, 399, 430, 430, 461, 461, 492, 28, 28,
+ 28, 59, 59, 90, 90, 121, 121, 152, 152, 183, 183, 214, 214, 245, 245,
+ 276, 276, 307, 307, 338, 338, 369, 369, 400, 400, 431, 431, 462, 462, 493,
+ 29, 29, 29, 60, 60, 91, 91, 122, 122, 153, 153, 184, 184, 215, 215,
+ 246, 246, 277, 277, 308, 308, 339, 339, 370, 370, 401, 401, 432, 432, 463,
+ 463, 494, 30, 30, 30, 61, 61, 92, 92, 123, 123, 154, 154, 185, 185,
+ 216, 216, 247, 247, 278, 278, 309, 309, 340, 340, 371, 371, 402, 402, 433,
+ 433, 464, 464, 495, 31, 62, 62, 93, 93, 124, 124, 155, 155, 186, 186,
+ 217, 217, 248, 248, 279, 279, 310, 310, 341, 341, 372, 372, 403, 403, 434,
+ 434, 465, 465, 496, 63, 94, 94, 125, 125, 156, 156, 187, 187, 218, 218,
+ 249, 249, 280, 280, 311, 311, 342, 342, 373, 373, 404, 404, 435, 435, 466,
+ 466, 497, 95, 126, 126, 157, 157, 188, 188, 219, 219, 250, 250, 281, 281,
+ 312, 312, 343, 343, 374, 374, 405, 405, 436, 436, 467, 467, 498, 127, 158,
+ 158, 189, 189, 220, 220, 251, 251, 282, 282, 313, 313, 344, 344, 375, 375,
+ 406, 406, 437, 437, 468, 468, 499, 159, 190, 190, 221, 221, 252, 252, 283,
+ 283, 314, 314, 345, 345, 376, 376, 407, 407, 438, 438, 469, 469, 500, 191,
+ 222, 222, 253, 253, 284, 284, 315, 315, 346, 346, 377, 377, 408, 408, 439,
+ 439, 470, 470, 501, 223, 254, 254, 285, 285, 316, 316, 347, 347, 378, 378,
+ 409, 409, 440, 440, 471, 471, 502, 255, 286, 286, 317, 317, 348, 348, 379,
+ 379, 410, 410, 441, 441, 472, 472, 503, 287, 318, 318, 349, 349, 380, 380,
+ 411, 411, 442, 442, 473, 473, 504, 319, 350, 350, 381, 381, 412, 412, 443,
+ 443, 474, 474, 505, 351, 382, 382, 413, 413, 444, 444, 475, 475, 506, 383,
+ 414, 414, 445, 445, 476, 476, 507, 415, 446, 446, 477, 477, 508, 447, 478,
+ 478, 509, 479, 510, 0, 0
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96,
+ 96, 112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208, 208,
+ 224, 224, 240, 240, 256, 256, 272, 272, 288, 288, 304, 304, 320, 320, 336,
+ 336, 352, 352, 368, 368, 384, 384, 400, 400, 416, 416, 432, 432, 448, 448,
+ 464, 464, 480, 480, 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65,
+ 80, 81, 96, 97, 112, 113, 128, 129, 144, 145, 160, 161, 176, 177, 192,
+ 193, 208, 209, 224, 225, 240, 241, 256, 257, 272, 273, 288, 289, 304, 305,
+ 320, 321, 336, 337, 352, 353, 368, 369, 384, 385, 400, 401, 416, 417, 432,
+ 433, 448, 449, 464, 465, 480, 481, 496, 1, 1, 2, 17, 18, 33, 34,
+ 49, 50, 65, 66, 81, 82, 97, 98, 113, 114, 129, 130, 145, 146, 161,
+ 162, 177, 178, 193, 194, 209, 210, 225, 226, 241, 242, 257, 258, 273, 274,
+ 289, 290, 305, 306, 321, 322, 337, 338, 353, 354, 369, 370, 385, 386, 401,
+ 402, 417, 418, 433, 434, 449, 450, 465, 466, 481, 482, 497, 2, 2, 3,
+ 18, 19, 34, 35, 50, 51, 66, 67, 82, 83, 98, 99, 114, 115, 130,
+ 131, 146, 147, 162, 163, 178, 179, 194, 195, 210, 211, 226, 227, 242, 243,
+ 258, 259, 274, 275, 290, 291, 306, 307, 322, 323, 338, 339, 354, 355, 370,
+ 371, 386, 387, 402, 403, 418, 419, 434, 435, 450, 451, 466, 467, 482, 483,
+ 498, 3, 3, 4, 19, 20, 35, 36, 51, 52, 67, 68, 83, 84, 99,
+ 100, 115, 116, 131, 132, 147, 148, 163, 164, 179, 180, 195, 196, 211, 212,
+ 227, 228, 243, 244, 259, 260, 275, 276, 291, 292, 307, 308, 323, 324, 339,
+ 340, 355, 356, 371, 372, 387, 388, 403, 404, 419, 420, 435, 436, 451, 452,
+ 467, 468, 483, 484, 499, 4, 4, 5, 20, 21, 36, 37, 52, 53, 68,
+ 69, 84, 85, 100, 101, 116, 117, 132, 133, 148, 149, 164, 165, 180, 181,
+ 196, 197, 212, 213, 228, 229, 244, 245, 260, 261, 276, 277, 292, 293, 308,
+ 309, 324, 325, 340, 341, 356, 357, 372, 373, 388, 389, 404, 405, 420, 421,
+ 436, 437, 452, 453, 468, 469, 484, 485, 500, 5, 5, 6, 21, 22, 37,
+ 38, 53, 54, 69, 70, 85, 86, 101, 102, 117, 118, 133, 134, 149, 150,
+ 165, 166, 181, 182, 197, 198, 213, 214, 229, 230, 245, 246, 261, 262, 277,
+ 278, 293, 294, 309, 310, 325, 326, 341, 342, 357, 358, 373, 374, 389, 390,
+ 405, 406, 421, 422, 437, 438, 453, 454, 469, 470, 485, 486, 501, 6, 6,
+ 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118, 119,
+ 134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215, 230, 231, 246,
+ 247, 262, 263, 278, 279, 294, 295, 310, 311, 326, 327, 342, 343, 358, 359,
+ 374, 375, 390, 391, 406, 407, 422, 423, 438, 439, 454, 455, 470, 471, 486,
+ 487, 502, 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88,
+ 103, 104, 119, 120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215,
+ 216, 231, 232, 247, 248, 263, 264, 279, 280, 295, 296, 311, 312, 327, 328,
+ 343, 344, 359, 360, 375, 376, 391, 392, 407, 408, 423, 424, 439, 440, 455,
+ 456, 471, 472, 487, 488, 503, 8, 8, 9, 24, 25, 40, 41, 56, 57,
+ 72, 73, 88, 89, 104, 105, 120, 121, 136, 137, 152, 153, 168, 169, 184,
+ 185, 200, 201, 216, 217, 232, 233, 248, 249, 264, 265, 280, 281, 296, 297,
+ 312, 313, 328, 329, 344, 345, 360, 361, 376, 377, 392, 393, 408, 409, 424,
+ 425, 440, 441, 456, 457, 472, 473, 488, 489, 504, 9, 9, 10, 25, 26,
+ 41, 42, 57, 58, 73, 74, 89, 90, 105, 106, 121, 122, 137, 138, 153,
+ 154, 169, 170, 185, 186, 201, 202, 217, 218, 233, 234, 249, 250, 265, 266,
+ 281, 282, 297, 298, 313, 314, 329, 330, 345, 346, 361, 362, 377, 378, 393,
+ 394, 409, 410, 425, 426, 441, 442, 457, 458, 473, 474, 489, 490, 505, 10,
+ 10, 11, 26, 27, 42, 43, 58, 59, 74, 75, 90, 91, 106, 107, 122,
+ 123, 138, 139, 154, 155, 170, 171, 186, 187, 202, 203, 218, 219, 234, 235,
+ 250, 251, 266, 267, 282, 283, 298, 299, 314, 315, 330, 331, 346, 347, 362,
+ 363, 378, 379, 394, 395, 410, 411, 426, 427, 442, 443, 458, 459, 474, 475,
+ 490, 491, 506, 11, 11, 12, 27, 28, 43, 44, 59, 60, 75, 76, 91,
+ 92, 107, 108, 123, 124, 139, 140, 155, 156, 171, 172, 187, 188, 203, 204,
+ 219, 220, 235, 236, 251, 252, 267, 268, 283, 284, 299, 300, 315, 316, 331,
+ 332, 347, 348, 363, 364, 379, 380, 395, 396, 411, 412, 427, 428, 443, 444,
+ 459, 460, 475, 476, 491, 492, 507, 12, 12, 13, 28, 29, 44, 45, 60,
+ 61, 76, 77, 92, 93, 108, 109, 124, 125, 140, 141, 156, 157, 172, 173,
+ 188, 189, 204, 205, 220, 221, 236, 237, 252, 253, 268, 269, 284, 285, 300,
+ 301, 316, 317, 332, 333, 348, 349, 364, 365, 380, 381, 396, 397, 412, 413,
+ 428, 429, 444, 445, 460, 461, 476, 477, 492, 493, 508, 13, 13, 14, 29,
+ 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125, 126, 141, 142,
+ 157, 158, 173, 174, 189, 190, 205, 206, 221, 222, 237, 238, 253, 254, 269,
+ 270, 285, 286, 301, 302, 317, 318, 333, 334, 349, 350, 365, 366, 381, 382,
+ 397, 398, 413, 414, 429, 430, 445, 446, 461, 462, 477, 478, 493, 494, 509,
+ 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111,
+ 126, 127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223, 238,
+ 239, 254, 255, 270, 271, 286, 287, 302, 303, 318, 319, 334, 335, 350, 351,
+ 366, 367, 382, 383, 398, 399, 414, 415, 430, 431, 446, 447, 462, 463, 478,
+ 479, 494, 495, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, 192,
+ 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 416, 416,
+ 448, 448, 0, 0, 1, 32, 33, 64, 65, 96, 97, 128, 129, 160, 161,
+ 192, 193, 224, 225, 256, 257, 288, 289, 320, 321, 352, 353, 384, 385, 416,
+ 417, 448, 449, 480, 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130,
+ 161, 162, 193, 194, 225, 226, 257, 258, 289, 290, 321, 322, 353, 354, 385,
+ 386, 417, 418, 449, 450, 481, 2, 2, 3, 34, 35, 66, 67, 98, 99,
+ 130, 131, 162, 163, 194, 195, 226, 227, 258, 259, 290, 291, 322, 323, 354,
+ 355, 386, 387, 418, 419, 450, 451, 482, 3, 3, 4, 35, 36, 67, 68,
+ 99, 100, 131, 132, 163, 164, 195, 196, 227, 228, 259, 260, 291, 292, 323,
+ 324, 355, 356, 387, 388, 419, 420, 451, 452, 483, 4, 4, 5, 36, 37,
+ 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228, 229, 260, 261, 292,
+ 293, 324, 325, 356, 357, 388, 389, 420, 421, 452, 453, 484, 5, 5, 6,
+ 37, 38, 69, 70, 101, 102, 133, 134, 165, 166, 197, 198, 229, 230, 261,
+ 262, 293, 294, 325, 326, 357, 358, 389, 390, 421, 422, 453, 454, 485, 6,
+ 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, 199, 230,
+ 231, 262, 263, 294, 295, 326, 327, 358, 359, 390, 391, 422, 423, 454, 455,
+ 486, 7, 7, 8, 39, 40, 71, 72, 103, 104, 135, 136, 167, 168, 199,
+ 200, 231, 232, 263, 264, 295, 296, 327, 328, 359, 360, 391, 392, 423, 424,
+ 455, 456, 487, 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168,
+ 169, 200, 201, 232, 233, 264, 265, 296, 297, 328, 329, 360, 361, 392, 393,
+ 424, 425, 456, 457, 488, 9, 9, 10, 41, 42, 73, 74, 105, 106, 137,
+ 138, 169, 170, 201, 202, 233, 234, 265, 266, 297, 298, 329, 330, 361, 362,
+ 393, 394, 425, 426, 457, 458, 489, 10, 10, 11, 42, 43, 74, 75, 106,
+ 107, 138, 139, 170, 171, 202, 203, 234, 235, 266, 267, 298, 299, 330, 331,
+ 362, 363, 394, 395, 426, 427, 458, 459, 490, 11, 11, 12, 43, 44, 75,
+ 76, 107, 108, 139, 140, 171, 172, 203, 204, 235, 236, 267, 268, 299, 300,
+ 331, 332, 363, 364, 395, 396, 427, 428, 459, 460, 491, 12, 12, 13, 44,
+ 45, 76, 77, 108, 109, 140, 141, 172, 173, 204, 205, 236, 237, 268, 269,
+ 300, 301, 332, 333, 364, 365, 396, 397, 428, 429, 460, 461, 492, 13, 13,
+ 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205, 206, 237, 238,
+ 269, 270, 301, 302, 333, 334, 365, 366, 397, 398, 429, 430, 461, 462, 493,
+ 14, 14, 15, 46, 47, 78, 79, 110, 111, 142, 143, 174, 175, 206, 207,
+ 238, 239, 270, 271, 302, 303, 334, 335, 366, 367, 398, 399, 430, 431, 462,
+ 463, 494, 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, 176,
+ 207, 208, 239, 240, 271, 272, 303, 304, 335, 336, 367, 368, 399, 400, 431,
+ 432, 463, 464, 495, 16, 16, 17, 48, 49, 80, 81, 112, 113, 144, 145,
+ 176, 177, 208, 209, 240, 241, 272, 273, 304, 305, 336, 337, 368, 369, 400,
+ 401, 432, 433, 464, 465, 496, 17, 17, 18, 49, 50, 81, 82, 113, 114,
+ 145, 146, 177, 178, 209, 210, 241, 242, 273, 274, 305, 306, 337, 338, 369,
+ 370, 401, 402, 433, 434, 465, 466, 497, 18, 18, 19, 50, 51, 82, 83,
+ 114, 115, 146, 147, 178, 179, 210, 211, 242, 243, 274, 275, 306, 307, 338,
+ 339, 370, 371, 402, 403, 434, 435, 466, 467, 498, 19, 19, 20, 51, 52,
+ 83, 84, 115, 116, 147, 148, 179, 180, 211, 212, 243, 244, 275, 276, 307,
+ 308, 339, 340, 371, 372, 403, 404, 435, 436, 467, 468, 499, 20, 20, 21,
+ 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212, 213, 244, 245, 276,
+ 277, 308, 309, 340, 341, 372, 373, 404, 405, 436, 437, 468, 469, 500, 21,
+ 21, 22, 53, 54, 85, 86, 117, 118, 149, 150, 181, 182, 213, 214, 245,
+ 246, 277, 278, 309, 310, 341, 342, 373, 374, 405, 406, 437, 438, 469, 470,
+ 501, 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, 183, 214,
+ 215, 246, 247, 278, 279, 310, 311, 342, 343, 374, 375, 406, 407, 438, 439,
+ 470, 471, 502, 23, 23, 24, 55, 56, 87, 88, 119, 120, 151, 152, 183,
+ 184, 215, 216, 247, 248, 279, 280, 311, 312, 343, 344, 375, 376, 407, 408,
+ 439, 440, 471, 472, 503, 24, 24, 25, 56, 57, 88, 89, 120, 121, 152,
+ 153, 184, 185, 216, 217, 248, 249, 280, 281, 312, 313, 344, 345, 376, 377,
+ 408, 409, 440, 441, 472, 473, 504, 25, 25, 26, 57, 58, 89, 90, 121,
+ 122, 153, 154, 185, 186, 217, 218, 249, 250, 281, 282, 313, 314, 345, 346,
+ 377, 378, 409, 410, 441, 442, 473, 474, 505, 26, 26, 27, 58, 59, 90,
+ 91, 122, 123, 154, 155, 186, 187, 218, 219, 250, 251, 282, 283, 314, 315,
+ 346, 347, 378, 379, 410, 411, 442, 443, 474, 475, 506, 27, 27, 28, 59,
+ 60, 91, 92, 123, 124, 155, 156, 187, 188, 219, 220, 251, 252, 283, 284,
+ 315, 316, 347, 348, 379, 380, 411, 412, 443, 444, 475, 476, 507, 28, 28,
+ 29, 60, 61, 92, 93, 124, 125, 156, 157, 188, 189, 220, 221, 252, 253,
+ 284, 285, 316, 317, 348, 349, 380, 381, 412, 413, 444, 445, 476, 477, 508,
+ 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, 190, 221, 222,
+ 253, 254, 285, 286, 317, 318, 349, 350, 381, 382, 413, 414, 445, 446, 477,
+ 478, 509, 30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191,
+ 222, 223, 254, 255, 286, 287, 318, 319, 350, 351, 382, 383, 414, 415, 446,
+ 447, 478, 479, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6,
+ 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28,
+ 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21,
+ 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43,
+ 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36,
+ 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58,
+ 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51,
+ 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73,
+ 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66,
+ 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88,
+ 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81,
+ 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103,
+ 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96,
+ 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118,
+ 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
+ 126, 112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118, 133,
+ 119, 134, 120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126,
+ 141, 127, 142, 128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148,
+ 134, 149, 135, 150, 136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141,
+ 156, 142, 157, 143, 158, 144, 144, 145, 160, 146, 161, 147, 162, 148, 163,
+ 149, 164, 150, 165, 151, 166, 152, 167, 153, 168, 154, 169, 155, 170, 156,
+ 171, 157, 172, 158, 173, 159, 174, 160, 160, 161, 176, 162, 177, 163, 178,
+ 164, 179, 165, 180, 166, 181, 167, 182, 168, 183, 169, 184, 170, 185, 171,
+ 186, 172, 187, 173, 188, 174, 189, 175, 190, 176, 176, 177, 192, 178, 193,
+ 179, 194, 180, 195, 181, 196, 182, 197, 183, 198, 184, 199, 185, 200, 186,
+ 201, 187, 202, 188, 203, 189, 204, 190, 205, 191, 206, 192, 192, 193, 208,
+ 194, 209, 195, 210, 196, 211, 197, 212, 198, 213, 199, 214, 200, 215, 201,
+ 216, 202, 217, 203, 218, 204, 219, 205, 220, 206, 221, 207, 222, 208, 208,
+ 209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214, 229, 215, 230, 216,
+ 231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222, 237, 223, 238,
+ 224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230, 245, 231,
+ 246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253,
+ 239, 254, 240, 240, 241, 256, 242, 257, 243, 258, 244, 259, 245, 260, 246,
+ 261, 247, 262, 248, 263, 249, 264, 250, 265, 251, 266, 252, 267, 253, 268,
+ 254, 269, 255, 270, 256, 256, 257, 272, 258, 273, 259, 274, 260, 275, 261,
+ 276, 262, 277, 263, 278, 264, 279, 265, 280, 266, 281, 267, 282, 268, 283,
+ 269, 284, 270, 285, 271, 286, 272, 272, 273, 288, 274, 289, 275, 290, 276,
+ 291, 277, 292, 278, 293, 279, 294, 280, 295, 281, 296, 282, 297, 283, 298,
+ 284, 299, 285, 300, 286, 301, 287, 302, 288, 288, 289, 304, 290, 305, 291,
+ 306, 292, 307, 293, 308, 294, 309, 295, 310, 296, 311, 297, 312, 298, 313,
+ 299, 314, 300, 315, 301, 316, 302, 317, 303, 318, 304, 304, 305, 320, 306,
+ 321, 307, 322, 308, 323, 309, 324, 310, 325, 311, 326, 312, 327, 313, 328,
+ 314, 329, 315, 330, 316, 331, 317, 332, 318, 333, 319, 334, 320, 320, 321,
+ 336, 322, 337, 323, 338, 324, 339, 325, 340, 326, 341, 327, 342, 328, 343,
+ 329, 344, 330, 345, 331, 346, 332, 347, 333, 348, 334, 349, 335, 350, 336,
+ 336, 337, 352, 338, 353, 339, 354, 340, 355, 341, 356, 342, 357, 343, 358,
+ 344, 359, 345, 360, 346, 361, 347, 362, 348, 363, 349, 364, 350, 365, 351,
+ 366, 352, 352, 353, 368, 354, 369, 355, 370, 356, 371, 357, 372, 358, 373,
+ 359, 374, 360, 375, 361, 376, 362, 377, 363, 378, 364, 379, 365, 380, 366,
+ 381, 367, 382, 368, 368, 369, 384, 370, 385, 371, 386, 372, 387, 373, 388,
+ 374, 389, 375, 390, 376, 391, 377, 392, 378, 393, 379, 394, 380, 395, 381,
+ 396, 382, 397, 383, 398, 384, 384, 385, 400, 386, 401, 387, 402, 388, 403,
+ 389, 404, 390, 405, 391, 406, 392, 407, 393, 408, 394, 409, 395, 410, 396,
+ 411, 397, 412, 398, 413, 399, 414, 400, 400, 401, 416, 402, 417, 403, 418,
+ 404, 419, 405, 420, 406, 421, 407, 422, 408, 423, 409, 424, 410, 425, 411,
+ 426, 412, 427, 413, 428, 414, 429, 415, 430, 416, 416, 417, 432, 418, 433,
+ 419, 434, 420, 435, 421, 436, 422, 437, 423, 438, 424, 439, 425, 440, 426,
+ 441, 427, 442, 428, 443, 429, 444, 430, 445, 431, 446, 432, 432, 433, 448,
+ 434, 449, 435, 450, 436, 451, 437, 452, 438, 453, 439, 454, 440, 455, 441,
+ 456, 442, 457, 443, 458, 444, 459, 445, 460, 446, 461, 447, 462, 448, 448,
+ 449, 464, 450, 465, 451, 466, 452, 467, 453, 468, 454, 469, 455, 470, 456,
+ 471, 457, 472, 458, 473, 459, 474, 460, 475, 461, 476, 462, 477, 463, 478,
+ 464, 464, 465, 480, 466, 481, 467, 482, 468, 483, 469, 484, 470, 485, 471,
+ 486, 472, 487, 473, 488, 474, 489, 475, 490, 476, 491, 477, 492, 478, 493,
+ 479, 494, 480, 480, 481, 496, 482, 497, 483, 498, 484, 499, 485, 500, 486,
+ 501, 487, 502, 488, 503, 489, 504, 490, 505, 491, 506, 492, 507, 493, 508,
+ 494, 509, 495, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21,
+ 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
+ 29, 29, 30, 30, 0, 0, 1, 32, 2, 33, 3, 34, 4, 35, 5,
+ 36, 6, 37, 7, 38, 8, 39, 9, 40, 10, 41, 11, 42, 12, 43,
+ 13, 44, 14, 45, 15, 46, 16, 47, 17, 48, 18, 49, 19, 50, 20,
+ 51, 21, 52, 22, 53, 23, 54, 24, 55, 25, 56, 26, 57, 27, 58,
+ 28, 59, 29, 60, 30, 61, 31, 62, 32, 32, 33, 64, 34, 65, 35,
+ 66, 36, 67, 37, 68, 38, 69, 39, 70, 40, 71, 41, 72, 42, 73,
+ 43, 74, 44, 75, 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50,
+ 81, 51, 82, 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88,
+ 58, 89, 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65,
+ 96, 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103,
+ 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110, 80,
+ 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117, 87, 118,
+ 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124, 94, 125, 95,
+ 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131, 101, 132, 102, 133,
+ 103, 134, 104, 135, 105, 136, 106, 137, 107, 138, 108, 139, 109, 140, 110,
+ 141, 111, 142, 112, 143, 113, 144, 114, 145, 115, 146, 116, 147, 117, 148,
+ 118, 149, 119, 150, 120, 151, 121, 152, 122, 153, 123, 154, 124, 155, 125,
+ 156, 126, 157, 127, 158, 128, 128, 129, 160, 130, 161, 131, 162, 132, 163,
+ 133, 164, 134, 165, 135, 166, 136, 167, 137, 168, 138, 169, 139, 170, 140,
+ 171, 141, 172, 142, 173, 143, 174, 144, 175, 145, 176, 146, 177, 147, 178,
+ 148, 179, 149, 180, 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155,
+ 186, 156, 187, 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193,
+ 163, 194, 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170,
+ 201, 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208,
+ 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215, 185,
+ 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222, 192, 192,
+ 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229, 199, 230, 200,
+ 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236, 206, 237, 207, 238,
+ 208, 239, 209, 240, 210, 241, 211, 242, 212, 243, 213, 244, 214, 245, 215,
+ 246, 216, 247, 217, 248, 218, 249, 219, 250, 220, 251, 221, 252, 222, 253,
+ 223, 254, 224, 224, 225, 256, 226, 257, 227, 258, 228, 259, 229, 260, 230,
+ 261, 231, 262, 232, 263, 233, 264, 234, 265, 235, 266, 236, 267, 237, 268,
+ 238, 269, 239, 270, 240, 271, 241, 272, 242, 273, 243, 274, 244, 275, 245,
+ 276, 246, 277, 247, 278, 248, 279, 249, 280, 250, 281, 251, 282, 252, 283,
+ 253, 284, 254, 285, 255, 286, 256, 256, 257, 288, 258, 289, 259, 290, 260,
+ 291, 261, 292, 262, 293, 263, 294, 264, 295, 265, 296, 266, 297, 267, 298,
+ 268, 299, 269, 300, 270, 301, 271, 302, 272, 303, 273, 304, 274, 305, 275,
+ 306, 276, 307, 277, 308, 278, 309, 279, 310, 280, 311, 281, 312, 282, 313,
+ 283, 314, 284, 315, 285, 316, 286, 317, 287, 318, 288, 288, 289, 320, 290,
+ 321, 291, 322, 292, 323, 293, 324, 294, 325, 295, 326, 296, 327, 297, 328,
+ 298, 329, 299, 330, 300, 331, 301, 332, 302, 333, 303, 334, 304, 335, 305,
+ 336, 306, 337, 307, 338, 308, 339, 309, 340, 310, 341, 311, 342, 312, 343,
+ 313, 344, 314, 345, 315, 346, 316, 347, 317, 348, 318, 349, 319, 350, 320,
+ 320, 321, 352, 322, 353, 323, 354, 324, 355, 325, 356, 326, 357, 327, 358,
+ 328, 359, 329, 360, 330, 361, 331, 362, 332, 363, 333, 364, 334, 365, 335,
+ 366, 336, 367, 337, 368, 338, 369, 339, 370, 340, 371, 341, 372, 342, 373,
+ 343, 374, 344, 375, 345, 376, 346, 377, 347, 378, 348, 379, 349, 380, 350,
+ 381, 351, 382, 352, 352, 353, 384, 354, 385, 355, 386, 356, 387, 357, 388,
+ 358, 389, 359, 390, 360, 391, 361, 392, 362, 393, 363, 394, 364, 395, 365,
+ 396, 366, 397, 367, 398, 368, 399, 369, 400, 370, 401, 371, 402, 372, 403,
+ 373, 404, 374, 405, 375, 406, 376, 407, 377, 408, 378, 409, 379, 410, 380,
+ 411, 381, 412, 382, 413, 383, 414, 384, 384, 385, 416, 386, 417, 387, 418,
+ 388, 419, 389, 420, 390, 421, 391, 422, 392, 423, 393, 424, 394, 425, 395,
+ 426, 396, 427, 397, 428, 398, 429, 399, 430, 400, 431, 401, 432, 402, 433,
+ 403, 434, 404, 435, 405, 436, 406, 437, 407, 438, 408, 439, 409, 440, 410,
+ 441, 411, 442, 412, 443, 413, 444, 414, 445, 415, 446, 416, 416, 417, 448,
+ 418, 449, 419, 450, 420, 451, 421, 452, 422, 453, 423, 454, 424, 455, 425,
+ 456, 426, 457, 427, 458, 428, 459, 429, 460, 430, 461, 431, 462, 432, 463,
+ 433, 464, 434, 465, 435, 466, 436, 467, 437, 468, 438, 469, 439, 470, 440,
+ 471, 441, 472, 442, 473, 443, 474, 444, 475, 445, 476, 446, 477, 447, 478,
+ 448, 448, 449, 480, 450, 481, 451, 482, 452, 483, 453, 484, 454, 485, 455,
+ 486, 456, 487, 457, 488, 458, 489, 459, 490, 460, 491, 461, 492, 462, 493,
+ 463, 494, 464, 495, 465, 496, 466, 497, 467, 498, 468, 499, 469, 500, 470,
+ 501, 471, 502, 472, 503, 473, 504, 474, 505, 475, 506, 476, 507, 477, 508,
+ 478, 509, 479, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96,
+ 96, 112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208, 208,
+ 224, 224, 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81,
+ 96, 97, 112, 113, 128, 129, 144, 145, 160, 161, 176, 177, 192, 193, 208,
+ 209, 224, 225, 240, 1, 1, 2, 17, 18, 33, 34, 49, 50, 65, 66,
+ 81, 82, 97, 98, 113, 114, 129, 130, 145, 146, 161, 162, 177, 178, 193,
+ 194, 209, 210, 225, 226, 241, 2, 2, 3, 18, 19, 34, 35, 50, 51,
+ 66, 67, 82, 83, 98, 99, 114, 115, 130, 131, 146, 147, 162, 163, 178,
+ 179, 194, 195, 210, 211, 226, 227, 242, 3, 3, 4, 19, 20, 35, 36,
+ 51, 52, 67, 68, 83, 84, 99, 100, 115, 116, 131, 132, 147, 148, 163,
+ 164, 179, 180, 195, 196, 211, 212, 227, 228, 243, 4, 4, 5, 20, 21,
+ 36, 37, 52, 53, 68, 69, 84, 85, 100, 101, 116, 117, 132, 133, 148,
+ 149, 164, 165, 180, 181, 196, 197, 212, 213, 228, 229, 244, 5, 5, 6,
+ 21, 22, 37, 38, 53, 54, 69, 70, 85, 86, 101, 102, 117, 118, 133,
+ 134, 149, 150, 165, 166, 181, 182, 197, 198, 213, 214, 229, 230, 245, 6,
+ 6, 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118,
+ 119, 134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215, 230, 231,
+ 246, 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, 103,
+ 104, 119, 120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215, 216,
+ 231, 232, 247, 8, 8, 9, 24, 25, 40, 41, 56, 57, 72, 73, 88,
+ 89, 104, 105, 120, 121, 136, 137, 152, 153, 168, 169, 184, 185, 200, 201,
+ 216, 217, 232, 233, 248, 9, 9, 10, 25, 26, 41, 42, 57, 58, 73,
+ 74, 89, 90, 105, 106, 121, 122, 137, 138, 153, 154, 169, 170, 185, 186,
+ 201, 202, 217, 218, 233, 234, 249, 10, 10, 11, 26, 27, 42, 43, 58,
+ 59, 74, 75, 90, 91, 106, 107, 122, 123, 138, 139, 154, 155, 170, 171,
+ 186, 187, 202, 203, 218, 219, 234, 235, 250, 11, 11, 12, 27, 28, 43,
+ 44, 59, 60, 75, 76, 91, 92, 107, 108, 123, 124, 139, 140, 155, 156,
+ 171, 172, 187, 188, 203, 204, 219, 220, 235, 236, 251, 12, 12, 13, 28,
+ 29, 44, 45, 60, 61, 76, 77, 92, 93, 108, 109, 124, 125, 140, 141,
+ 156, 157, 172, 173, 188, 189, 204, 205, 220, 221, 236, 237, 252, 13, 13,
+ 14, 29, 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125, 126,
+ 141, 142, 157, 158, 173, 174, 189, 190, 205, 206, 221, 222, 237, 238, 253,
+ 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111,
+ 126, 127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223, 238,
+ 239, 254, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6,
+ 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28,
+ 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21,
+ 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43,
+ 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36,
+ 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58,
+ 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51,
+ 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73,
+ 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66,
+ 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88,
+ 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81,
+ 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103,
+ 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96,
+ 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118,
+ 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
+ 126, 112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118, 133,
+ 119, 134, 120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126,
+ 141, 127, 142, 128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148,
+ 134, 149, 135, 150, 136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141,
+ 156, 142, 157, 143, 158, 144, 144, 145, 160, 146, 161, 147, 162, 148, 163,
+ 149, 164, 150, 165, 151, 166, 152, 167, 153, 168, 154, 169, 155, 170, 156,
+ 171, 157, 172, 158, 173, 159, 174, 160, 160, 161, 176, 162, 177, 163, 178,
+ 164, 179, 165, 180, 166, 181, 167, 182, 168, 183, 169, 184, 170, 185, 171,
+ 186, 172, 187, 173, 188, 174, 189, 175, 190, 176, 176, 177, 192, 178, 193,
+ 179, 194, 180, 195, 181, 196, 182, 197, 183, 198, 184, 199, 185, 200, 186,
+ 201, 187, 202, 188, 203, 189, 204, 190, 205, 191, 206, 192, 192, 193, 208,
+ 194, 209, 195, 210, 196, 211, 197, 212, 198, 213, 199, 214, 200, 215, 201,
+ 216, 202, 217, 203, 218, 204, 219, 205, 220, 206, 221, 207, 222, 208, 208,
+ 209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214, 229, 215, 230, 216,
+ 231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222, 237, 223, 238,
+ 224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230, 245, 231,
+ 246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253,
+ 239, 254, 0, 0,
+};
+#endif // CONFIG_EXT_TX
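+
+/* Consumption sketch: the coefficient coder derives the entropy context of
+ * scan position c by averaging the token magnitudes already cached for its
+ * two table neighbors, roughly (cf. get_coef_context() in the scan header):
+ *
+ *   ctx = (1 + token_cache[nb[2 * c + 0]] + token_cache[nb[2 * c + 1]]) >> 1;
+ *
+ * which is why every pair above references only raster positions that the
+ * corresponding scan order has already visited. */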
+
+DECLARE_ALIGNED(16, static const int16_t,
col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 16, 16, 32, 32, 0, 0, 48, 48, 1, 1, 64,
- 64, 17, 17, 80, 80, 33, 33, 1, 1, 49, 49, 96, 96, 2, 2,
- 65, 65, 18, 18, 112, 112, 34, 34, 81, 81, 2, 2, 50, 50, 128,
- 128, 3, 3, 97, 97, 19, 19, 66, 66, 144, 144, 82, 82, 35, 35,
- 113, 113, 3, 3, 51, 51, 160, 160, 4, 4, 98, 98, 129, 129, 67,
- 67, 20, 20, 83, 83, 114, 114, 36, 36, 176, 176, 4, 4, 145, 145,
- 52, 52, 99, 99, 5, 5, 130, 130, 68, 68, 192, 192, 161, 161, 21,
- 21, 115, 115, 84, 84, 37, 37, 146, 146, 208, 208, 53, 53, 5, 5,
- 100, 100, 177, 177, 131, 131, 69, 69, 6, 6, 224, 224, 116, 116, 22,
- 22, 162, 162, 85, 85, 147, 147, 38, 38, 193, 193, 101, 101, 54, 54,
- 6, 6, 132, 132, 178, 178, 70, 70, 163, 163, 209, 209, 7, 7, 117,
- 117, 23, 23, 148, 148, 7, 7, 86, 86, 194, 194, 225, 225, 39, 39,
- 179, 179, 102, 102, 133, 133, 55, 55, 164, 164, 8, 8, 71, 71, 210,
- 210, 118, 118, 149, 149, 195, 195, 24, 24, 87, 87, 40, 40, 56, 56,
- 134, 134, 180, 180, 226, 226, 103, 103, 8, 8, 165, 165, 211, 211, 72,
- 72, 150, 150, 9, 9, 119, 119, 25, 25, 88, 88, 196, 196, 41, 41,
- 135, 135, 181, 181, 104, 104, 57, 57, 227, 227, 166, 166, 120, 120, 151,
- 151, 197, 197, 73, 73, 9, 9, 212, 212, 89, 89, 136, 136, 182, 182,
- 10, 10, 26, 26, 105, 105, 167, 167, 228, 228, 152, 152, 42, 42, 121,
- 121, 213, 213, 58, 58, 198, 198, 74, 74, 137, 137, 183, 183, 168, 168,
- 10, 10, 90, 90, 229, 229, 11, 11, 106, 106, 214, 214, 153, 153, 27,
- 27, 199, 199, 43, 43, 184, 184, 122, 122, 169, 169, 230, 230, 59, 59,
- 11, 11, 75, 75, 138, 138, 200, 200, 215, 215, 91, 91, 12, 12, 28,
- 28, 185, 185, 107, 107, 154, 154, 44, 44, 231, 231, 216, 216, 60, 60,
- 123, 123, 12, 12, 76, 76, 201, 201, 170, 170, 232, 232, 139, 139, 92,
- 92, 13, 13, 108, 108, 29, 29, 186, 186, 217, 217, 155, 155, 45, 45,
- 13, 13, 61, 61, 124, 124, 14, 14, 233, 233, 77, 77, 14, 14, 171,
- 171, 140, 140, 202, 202, 30, 30, 93, 93, 109, 109, 46, 46, 156, 156,
- 62, 62, 187, 187, 15, 15, 125, 125, 218, 218, 78, 78, 31, 31, 172,
- 172, 47, 47, 141, 141, 94, 94, 234, 234, 203, 203, 63, 63, 110, 110,
- 188, 188, 157, 157, 126, 126, 79, 79, 173, 173, 95, 95, 219, 219, 142,
- 142, 204, 204, 235, 235, 111, 111, 158, 158, 127, 127, 189, 189, 220, 220,
- 143, 143, 174, 174, 205, 205, 236, 236, 159, 159, 190, 190, 221, 221, 175,
- 175, 237, 237, 206, 206, 222, 222, 191, 191, 238, 238, 207, 207, 223, 223,
- 239, 239, 0, 0,
+ 0, 0, 0, 0, 16, 16, 32, 32, 16, 0, 48, 48, 1, 16, 64,
+ 64, 17, 32, 80, 80, 33, 48, 17, 1, 49, 64, 96, 96, 2, 17,
+ 65, 80, 18, 33, 112, 112, 34, 49, 81, 96, 18, 2, 50, 65, 128,
+ 128, 3, 18, 97, 112, 19, 34, 66, 81, 144, 144, 82, 97, 35, 50,
+ 113, 128, 19, 3, 51, 66, 160, 160, 4, 19, 98, 113, 129, 144, 67,
+ 82, 20, 35, 83, 98, 114, 129, 36, 51, 176, 176, 20, 4, 145, 160,
+ 52, 67, 99, 114, 5, 20, 130, 145, 68, 83, 192, 192, 161, 176, 21,
+ 36, 115, 130, 84, 99, 37, 52, 146, 161, 208, 208, 53, 68, 21, 5,
+ 100, 115, 177, 192, 131, 146, 69, 84, 6, 21, 224, 224, 116, 131, 22,
+ 37, 162, 177, 85, 100, 147, 162, 38, 53, 193, 208, 101, 116, 54, 69,
+ 22, 6, 132, 147, 178, 193, 70, 85, 163, 178, 209, 224, 7, 22, 117,
+ 132, 23, 38, 148, 163, 23, 7, 86, 101, 194, 209, 225, 240, 39, 54,
+ 179, 194, 102, 117, 133, 148, 55, 70, 164, 179, 8, 23, 71, 86, 210,
+ 225, 118, 133, 149, 164, 195, 210, 24, 39, 87, 102, 40, 55, 56, 71,
+ 134, 149, 180, 195, 226, 241, 103, 118, 24, 8, 165, 180, 211, 226, 72,
+ 87, 150, 165, 9, 24, 119, 134, 25, 40, 88, 103, 196, 211, 41, 56,
+ 135, 150, 181, 196, 104, 119, 57, 72, 227, 242, 166, 181, 120, 135, 151,
+ 166, 197, 212, 73, 88, 25, 9, 212, 227, 89, 104, 136, 151, 182, 197,
+ 10, 25, 26, 41, 105, 120, 167, 182, 228, 243, 152, 167, 42, 57, 121,
+ 136, 213, 228, 58, 73, 198, 213, 74, 89, 137, 152, 183, 198, 168, 183,
+ 26, 10, 90, 105, 229, 244, 11, 26, 106, 121, 214, 229, 153, 168, 27,
+ 42, 199, 214, 43, 58, 184, 199, 122, 137, 169, 184, 230, 245, 59, 74,
+ 27, 11, 75, 90, 138, 153, 200, 215, 215, 230, 91, 106, 12, 27, 28,
+ 43, 185, 200, 107, 122, 154, 169, 44, 59, 231, 246, 216, 231, 60, 75,
+ 123, 138, 28, 12, 76, 91, 201, 216, 170, 185, 232, 247, 139, 154, 92,
+ 107, 13, 28, 108, 123, 29, 44, 186, 201, 217, 232, 155, 170, 45, 60,
+ 29, 13, 61, 76, 124, 139, 14, 14, 233, 248, 77, 92, 14, 29, 171,
+ 186, 140, 155, 202, 217, 30, 45, 93, 108, 109, 124, 46, 61, 156, 171,
+ 62, 77, 187, 202, 15, 30, 125, 140, 218, 233, 78, 93, 31, 46, 172,
+ 187, 47, 62, 141, 156, 94, 109, 234, 249, 203, 218, 63, 78, 110, 125,
+ 188, 203, 157, 172, 126, 141, 79, 94, 173, 188, 95, 110, 219, 234, 142,
+ 157, 204, 219, 235, 250, 111, 126, 158, 173, 127, 142, 189, 204, 220, 235,
+ 143, 158, 174, 189, 205, 220, 236, 251, 159, 174, 190, 205, 221, 236, 175,
+ 190, 237, 252, 206, 221, 222, 237, 191, 206, 238, 253, 207, 222, 223, 238,
+ 239, 254, 0, 0,
};
DECLARE_ALIGNED(16, static const int16_t,
row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 16, 16, 3, 3, 17,
- 17, 16, 16, 4, 4, 32, 32, 18, 18, 5, 5, 33, 33, 32, 32,
- 19, 19, 48, 48, 6, 6, 34, 34, 20, 20, 49, 49, 48, 48, 7,
- 7, 35, 35, 64, 64, 21, 21, 50, 50, 36, 36, 64, 64, 8, 8,
- 65, 65, 51, 51, 22, 22, 37, 37, 80, 80, 66, 66, 9, 9, 52,
- 52, 23, 23, 81, 81, 67, 67, 80, 80, 38, 38, 10, 10, 53, 53,
- 82, 82, 96, 96, 68, 68, 24, 24, 97, 97, 83, 83, 39, 39, 96,
- 96, 54, 54, 11, 11, 69, 69, 98, 98, 112, 112, 84, 84, 25, 25,
- 40, 40, 55, 55, 113, 113, 99, 99, 12, 12, 70, 70, 112, 112, 85,
- 85, 26, 26, 114, 114, 100, 100, 128, 128, 41, 41, 56, 56, 71, 71,
- 115, 115, 13, 13, 86, 86, 129, 129, 101, 101, 128, 128, 72, 72, 130,
- 130, 116, 116, 27, 27, 57, 57, 14, 14, 87, 87, 42, 42, 144, 144,
- 102, 102, 131, 131, 145, 145, 117, 117, 73, 73, 144, 144, 88, 88, 132,
- 132, 103, 103, 28, 28, 58, 58, 146, 146, 118, 118, 43, 43, 160, 160,
- 147, 147, 89, 89, 104, 104, 133, 133, 161, 161, 119, 119, 160, 160, 74,
- 74, 134, 134, 148, 148, 29, 29, 59, 59, 162, 162, 176, 176, 44, 44,
- 120, 120, 90, 90, 105, 105, 163, 163, 177, 177, 149, 149, 176, 176, 135,
- 135, 164, 164, 178, 178, 30, 30, 150, 150, 192, 192, 75, 75, 121, 121,
- 60, 60, 136, 136, 193, 193, 106, 106, 151, 151, 179, 179, 192, 192, 45,
- 45, 165, 165, 166, 166, 194, 194, 91, 91, 180, 180, 137, 137, 208, 208,
- 122, 122, 152, 152, 208, 208, 195, 195, 76, 76, 167, 167, 209, 209, 181,
- 181, 224, 224, 107, 107, 196, 196, 61, 61, 153, 153, 224, 224, 182, 182,
- 168, 168, 210, 210, 46, 46, 138, 138, 92, 92, 183, 183, 225, 225, 211,
- 211, 240, 240, 197, 197, 169, 169, 123, 123, 154, 154, 198, 198, 77, 77,
- 212, 212, 184, 184, 108, 108, 226, 226, 199, 199, 62, 62, 227, 227, 241,
- 241, 139, 139, 213, 213, 170, 170, 185, 185, 155, 155, 228, 228, 242, 242,
- 124, 124, 93, 93, 200, 200, 243, 243, 214, 214, 215, 215, 229, 229, 140,
- 140, 186, 186, 201, 201, 78, 78, 171, 171, 109, 109, 156, 156, 244, 244,
- 216, 216, 230, 230, 94, 94, 245, 245, 231, 231, 125, 125, 202, 202, 246,
- 246, 232, 232, 172, 172, 217, 217, 141, 141, 110, 110, 157, 157, 187, 187,
- 247, 247, 126, 126, 233, 233, 218, 218, 248, 248, 188, 188, 203, 203, 142,
- 142, 173, 173, 158, 158, 249, 249, 234, 234, 204, 204, 219, 219, 174, 174,
- 189, 189, 250, 250, 220, 220, 190, 190, 205, 205, 235, 235, 206, 206, 236,
- 236, 251, 251, 221, 221, 252, 252, 222, 222, 237, 237, 238, 238, 253, 253,
- 254, 254, 0, 0,
+ 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 1, 16, 3, 3, 2,
+ 17, 16, 17, 4, 4, 17, 32, 3, 18, 5, 5, 18, 33, 32, 33,
+ 4, 19, 33, 48, 6, 6, 19, 34, 5, 20, 34, 49, 48, 49, 7,
+ 7, 20, 35, 49, 64, 6, 21, 35, 50, 21, 36, 64, 65, 8, 8,
+ 50, 65, 36, 51, 7, 22, 22, 37, 65, 80, 51, 66, 9, 9, 37,
+ 52, 8, 23, 66, 81, 52, 67, 80, 81, 23, 38, 10, 10, 38, 53,
+ 67, 82, 81, 96, 53, 68, 9, 24, 82, 97, 68, 83, 24, 39, 96,
+ 97, 39, 54, 11, 11, 54, 69, 83, 98, 97, 112, 69, 84, 10, 25,
+ 25, 40, 40, 55, 98, 113, 84, 99, 12, 12, 55, 70, 112, 113, 70,
+ 85, 11, 26, 99, 114, 85, 100, 113, 128, 26, 41, 41, 56, 56, 71,
+ 100, 115, 13, 13, 71, 86, 114, 129, 86, 101, 128, 129, 57, 72, 115,
+ 130, 101, 116, 12, 27, 42, 57, 14, 14, 72, 87, 27, 42, 129, 144,
+ 87, 102, 116, 131, 130, 145, 102, 117, 58, 73, 144, 145, 73, 88, 117,
+ 132, 88, 103, 13, 28, 43, 58, 131, 146, 103, 118, 28, 43, 145, 160,
+ 132, 147, 74, 89, 89, 104, 118, 133, 146, 161, 104, 119, 160, 161, 59,
+ 74, 119, 134, 133, 148, 14, 29, 44, 59, 147, 162, 161, 176, 29, 44,
+ 105, 120, 75, 90, 90, 105, 148, 163, 162, 177, 134, 149, 176, 177, 120,
+ 135, 149, 164, 163, 178, 15, 30, 135, 150, 177, 192, 60, 75, 106, 121,
+ 45, 60, 121, 136, 178, 193, 91, 106, 136, 151, 164, 179, 192, 193, 30,
+ 45, 150, 165, 151, 166, 179, 194, 76, 91, 165, 180, 122, 137, 193, 208,
+ 107, 122, 137, 152, 208, 209, 180, 195, 61, 76, 152, 167, 194, 209, 166,
+ 181, 224, 224, 92, 107, 181, 196, 46, 61, 138, 153, 209, 224, 167, 182,
+ 153, 168, 195, 210, 31, 46, 123, 138, 77, 92, 168, 183, 210, 225, 196,
+ 211, 225, 240, 182, 197, 154, 169, 108, 123, 139, 154, 183, 198, 62, 77,
+ 197, 212, 169, 184, 93, 108, 211, 226, 184, 199, 47, 62, 212, 227, 226,
+ 241, 124, 139, 198, 213, 155, 170, 170, 185, 140, 155, 213, 228, 227, 242,
+ 109, 124, 78, 93, 185, 200, 228, 243, 199, 214, 200, 215, 214, 229, 125,
+ 140, 171, 186, 186, 201, 63, 78, 156, 171, 94, 109, 141, 156, 229, 244,
+ 201, 216, 215, 230, 79, 94, 230, 245, 216, 231, 110, 125, 187, 202, 231,
+ 246, 217, 232, 157, 172, 202, 217, 126, 141, 95, 110, 142, 157, 172, 187,
+ 232, 247, 111, 126, 218, 233, 203, 218, 233, 248, 173, 188, 188, 203, 127,
+ 142, 158, 173, 143, 158, 234, 249, 219, 234, 189, 204, 204, 219, 159, 174,
+ 174, 189, 235, 250, 205, 220, 175, 190, 190, 205, 220, 235, 191, 206, 221,
+ 236, 236, 251, 206, 221, 237, 252, 207, 222, 222, 237, 223, 238, 238, 253,
+ 239, 254, 0, 0,
};
DECLARE_ALIGNED(16, static const int16_t,
default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 16, 16, 1, 16, 1, 1, 32, 32, 17,
- 32, 2, 17, 2, 2, 48, 48, 18, 33, 33, 48, 3, 18, 49, 64,
- 64, 64, 34, 49, 3, 3, 19, 34, 50, 65, 4, 19, 65, 80, 80,
- 80, 35, 50, 4, 4, 20, 35, 66, 81, 81, 96, 51, 66, 96, 96,
- 5, 20, 36, 51, 82, 97, 21, 36, 67, 82, 97, 112, 5, 5, 52,
- 67, 112, 112, 37, 52, 6, 21, 83, 98, 98, 113, 68, 83, 6, 6,
- 113, 128, 22, 37, 53, 68, 84, 99, 99, 114, 128, 128, 114, 129, 69,
- 84, 38, 53, 7, 22, 7, 7, 129, 144, 23, 38, 54, 69, 100, 115,
- 85, 100, 115, 130, 144, 144, 130, 145, 39, 54, 70, 85, 8, 23, 55,
- 70, 116, 131, 101, 116, 145, 160, 24, 39, 8, 8, 86, 101, 131, 146,
- 160, 160, 146, 161, 71, 86, 40, 55, 9, 24, 117, 132, 102, 117, 161,
- 176, 132, 147, 56, 71, 87, 102, 25, 40, 147, 162, 9, 9, 176, 176,
+ 0, 0, 0, 0, 16, 0, 16, 16, 1, 16, 17, 1, 32, 32, 17,
+ 32, 2, 17, 18, 2, 48, 48, 18, 33, 33, 48, 3, 18, 49, 64,
+ 64, 65, 34, 49, 19, 3, 19, 34, 50, 65, 4, 19, 65, 80, 80,
+ 81, 35, 50, 20, 4, 20, 35, 66, 81, 81, 96, 51, 66, 96, 97,
+ 5, 20, 36, 51, 82, 97, 21, 36, 67, 82, 97, 112, 21, 5, 52,
+ 67, 112, 113, 37, 52, 6, 21, 83, 98, 98, 113, 68, 83, 22, 6,
+ 113, 128, 22, 37, 53, 68, 84, 99, 99, 114, 128, 129, 114, 129, 69,
+ 84, 38, 53, 7, 22, 23, 7, 129, 144, 23, 38, 54, 69, 100, 115,
+ 85, 100, 115, 130, 144, 145, 130, 145, 39, 54, 70, 85, 8, 23, 55,
+ 70, 116, 131, 101, 116, 145, 160, 24, 39, 24, 8, 86, 101, 131, 146,
+ 160, 161, 146, 161, 71, 86, 40, 55, 9, 24, 117, 132, 102, 117, 161,
+ 176, 132, 147, 56, 71, 87, 102, 25, 40, 147, 162, 25, 9, 176, 177,
162, 177, 72, 87, 41, 56, 118, 133, 133, 148, 103, 118, 10, 25, 148,
- 163, 57, 72, 88, 103, 177, 192, 26, 41, 163, 178, 192, 192, 10, 10,
+ 163, 57, 72, 88, 103, 177, 192, 26, 41, 163, 178, 192, 193, 26, 10,
119, 134, 73, 88, 149, 164, 104, 119, 134, 149, 42, 57, 178, 193, 164,
179, 11, 26, 58, 73, 193, 208, 89, 104, 135, 150, 120, 135, 27, 42,
- 74, 89, 208, 208, 150, 165, 179, 194, 165, 180, 105, 120, 194, 209, 43,
- 58, 11, 11, 136, 151, 90, 105, 151, 166, 180, 195, 59, 74, 121, 136,
- 209, 224, 195, 210, 224, 224, 166, 181, 106, 121, 75, 90, 12, 27, 181,
- 196, 12, 12, 210, 225, 152, 167, 167, 182, 137, 152, 28, 43, 196, 211,
+ 74, 89, 208, 209, 150, 165, 179, 194, 165, 180, 105, 120, 194, 209, 43,
+ 58, 27, 11, 136, 151, 90, 105, 151, 166, 180, 195, 59, 74, 121, 136,
+ 209, 224, 195, 210, 224, 225, 166, 181, 106, 121, 75, 90, 12, 27, 181,
+ 196, 28, 12, 210, 225, 152, 167, 167, 182, 137, 152, 28, 43, 196, 211,
122, 137, 91, 106, 225, 240, 44, 59, 13, 28, 107, 122, 182, 197, 168,
183, 211, 226, 153, 168, 226, 241, 60, 75, 197, 212, 138, 153, 29, 44,
- 76, 91, 13, 13, 183, 198, 123, 138, 45, 60, 212, 227, 198, 213, 154,
- 169, 169, 184, 227, 242, 92, 107, 61, 76, 139, 154, 14, 29, 14, 14,
+ 76, 91, 29, 13, 183, 198, 123, 138, 45, 60, 212, 227, 198, 213, 154,
+ 169, 169, 184, 227, 242, 92, 107, 61, 76, 139, 154, 14, 29, 30, 14,
184, 199, 213, 228, 108, 123, 199, 214, 228, 243, 77, 92, 30, 45, 170,
185, 155, 170, 185, 200, 93, 108, 124, 139, 214, 229, 46, 61, 200, 215,
229, 244, 15, 30, 109, 124, 62, 77, 140, 155, 215, 230, 31, 46, 171,
@@ -364,84 +1917,388 @@
239, 254, 0, 0,
};
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160,
+ 192, 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384,
+ 416, 416, 448, 448, 480, 480, 512, 512, 544, 544, 576, 576, 608, 608,
+ 640, 640, 672, 672, 704, 704, 736, 736, 768, 768, 800, 800, 832, 832,
+ 864, 864, 896, 896, 928, 928, 960, 960, 0, 0, 1, 32, 33, 64,
+ 65, 96, 97, 128, 129, 160, 161, 192, 193, 224, 225, 256, 257, 288,
+ 289, 320, 321, 352, 353, 384, 385, 416, 417, 448, 449, 480, 481, 512,
+ 513, 544, 545, 576, 577, 608, 609, 640, 641, 672, 673, 704, 705, 736,
+ 737, 768, 769, 800, 801, 832, 833, 864, 865, 896, 897, 928, 929, 960,
+ 961, 992, 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130, 161,
+ 162, 193, 194, 225, 226, 257, 258, 289, 290, 321, 322, 353, 354, 385,
+ 386, 417, 418, 449, 450, 481, 482, 513, 514, 545, 546, 577, 578, 609,
+ 610, 641, 642, 673, 674, 705, 706, 737, 738, 769, 770, 801, 802, 833,
+ 834, 865, 866, 897, 898, 929, 930, 961, 962, 993, 2, 2, 3, 34,
+ 35, 66, 67, 98, 99, 130, 131, 162, 163, 194, 195, 226, 227, 258,
+ 259, 290, 291, 322, 323, 354, 355, 386, 387, 418, 419, 450, 451, 482,
+ 483, 514, 515, 546, 547, 578, 579, 610, 611, 642, 643, 674, 675, 706,
+ 707, 738, 739, 770, 771, 802, 803, 834, 835, 866, 867, 898, 899, 930,
+ 931, 962, 963, 994, 3, 3, 4, 35, 36, 67, 68, 99, 100, 131,
+ 132, 163, 164, 195, 196, 227, 228, 259, 260, 291, 292, 323, 324, 355,
+ 356, 387, 388, 419, 420, 451, 452, 483, 484, 515, 516, 547, 548, 579,
+ 580, 611, 612, 643, 644, 675, 676, 707, 708, 739, 740, 771, 772, 803,
+ 804, 835, 836, 867, 868, 899, 900, 931, 932, 963, 964, 995, 4, 4,
+ 5, 36, 37, 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228,
+ 229, 260, 261, 292, 293, 324, 325, 356, 357, 388, 389, 420, 421, 452,
+ 453, 484, 485, 516, 517, 548, 549, 580, 581, 612, 613, 644, 645, 676,
+ 677, 708, 709, 740, 741, 772, 773, 804, 805, 836, 837, 868, 869, 900,
+ 901, 932, 933, 964, 965, 996, 5, 5, 6, 37, 38, 69, 70, 101,
+ 102, 133, 134, 165, 166, 197, 198, 229, 230, 261, 262, 293, 294, 325,
+ 326, 357, 358, 389, 390, 421, 422, 453, 454, 485, 486, 517, 518, 549,
+ 550, 581, 582, 613, 614, 645, 646, 677, 678, 709, 710, 741, 742, 773,
+ 774, 805, 806, 837, 838, 869, 870, 901, 902, 933, 934, 965, 966, 997,
+ 6, 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198,
+ 199, 230, 231, 262, 263, 294, 295, 326, 327, 358, 359, 390, 391, 422,
+ 423, 454, 455, 486, 487, 518, 519, 550, 551, 582, 583, 614, 615, 646,
+ 647, 678, 679, 710, 711, 742, 743, 774, 775, 806, 807, 838, 839, 870,
+ 871, 902, 903, 934, 935, 966, 967, 998, 7, 7, 8, 39, 40, 71,
+ 72, 103, 104, 135, 136, 167, 168, 199, 200, 231, 232, 263, 264, 295,
+ 296, 327, 328, 359, 360, 391, 392, 423, 424, 455, 456, 487, 488, 519,
+ 520, 551, 552, 583, 584, 615, 616, 647, 648, 679, 680, 711, 712, 743,
+ 744, 775, 776, 807, 808, 839, 840, 871, 872, 903, 904, 935, 936, 967,
+ 968, 999, 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168,
+ 169, 200, 201, 232, 233, 264, 265, 296, 297, 328, 329, 360, 361, 392,
+ 393, 424, 425, 456, 457, 488, 489, 520, 521, 552, 553, 584, 585, 616,
+ 617, 648, 649, 680, 681, 712, 713, 744, 745, 776, 777, 808, 809, 840,
+ 841, 872, 873, 904, 905, 936, 937, 968, 969, 1000, 9, 9, 10, 41,
+ 42, 73, 74, 105, 106, 137, 138, 169, 170, 201, 202, 233, 234, 265,
+ 266, 297, 298, 329, 330, 361, 362, 393, 394, 425, 426, 457, 458, 489,
+ 490, 521, 522, 553, 554, 585, 586, 617, 618, 649, 650, 681, 682, 713,
+ 714, 745, 746, 777, 778, 809, 810, 841, 842, 873, 874, 905, 906, 937,
+ 938, 969, 970, 1001, 10, 10, 11, 42, 43, 74, 75, 106, 107, 138,
+ 139, 170, 171, 202, 203, 234, 235, 266, 267, 298, 299, 330, 331, 362,
+ 363, 394, 395, 426, 427, 458, 459, 490, 491, 522, 523, 554, 555, 586,
+ 587, 618, 619, 650, 651, 682, 683, 714, 715, 746, 747, 778, 779, 810,
+ 811, 842, 843, 874, 875, 906, 907, 938, 939, 970, 971, 1002, 11, 11,
+ 12, 43, 44, 75, 76, 107, 108, 139, 140, 171, 172, 203, 204, 235,
+ 236, 267, 268, 299, 300, 331, 332, 363, 364, 395, 396, 427, 428, 459,
+ 460, 491, 492, 523, 524, 555, 556, 587, 588, 619, 620, 651, 652, 683,
+ 684, 715, 716, 747, 748, 779, 780, 811, 812, 843, 844, 875, 876, 907,
+ 908, 939, 940, 971, 972, 1003, 12, 12, 13, 44, 45, 76, 77, 108,
+ 109, 140, 141, 172, 173, 204, 205, 236, 237, 268, 269, 300, 301, 332,
+ 333, 364, 365, 396, 397, 428, 429, 460, 461, 492, 493, 524, 525, 556,
+ 557, 588, 589, 620, 621, 652, 653, 684, 685, 716, 717, 748, 749, 780,
+ 781, 812, 813, 844, 845, 876, 877, 908, 909, 940, 941, 972, 973, 1004,
+ 13, 13, 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205,
+ 206, 237, 238, 269, 270, 301, 302, 333, 334, 365, 366, 397, 398, 429,
+ 430, 461, 462, 493, 494, 525, 526, 557, 558, 589, 590, 621, 622, 653,
+ 654, 685, 686, 717, 718, 749, 750, 781, 782, 813, 814, 845, 846, 877,
+ 878, 909, 910, 941, 942, 973, 974, 1005, 14, 14, 15, 46, 47, 78,
+ 79, 110, 111, 142, 143, 174, 175, 206, 207, 238, 239, 270, 271, 302,
+ 303, 334, 335, 366, 367, 398, 399, 430, 431, 462, 463, 494, 495, 526,
+ 527, 558, 559, 590, 591, 622, 623, 654, 655, 686, 687, 718, 719, 750,
+ 751, 782, 783, 814, 815, 846, 847, 878, 879, 910, 911, 942, 943, 974,
+ 975, 1006, 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175,
+ 176, 207, 208, 239, 240, 271, 272, 303, 304, 335, 336, 367, 368, 399,
+ 400, 431, 432, 463, 464, 495, 496, 527, 528, 559, 560, 591, 592, 623,
+ 624, 655, 656, 687, 688, 719, 720, 751, 752, 783, 784, 815, 816, 847,
+ 848, 879, 880, 911, 912, 943, 944, 975, 976, 1007, 16, 16, 17, 48,
+ 49, 80, 81, 112, 113, 144, 145, 176, 177, 208, 209, 240, 241, 272,
+ 273, 304, 305, 336, 337, 368, 369, 400, 401, 432, 433, 464, 465, 496,
+ 497, 528, 529, 560, 561, 592, 593, 624, 625, 656, 657, 688, 689, 720,
+ 721, 752, 753, 784, 785, 816, 817, 848, 849, 880, 881, 912, 913, 944,
+ 945, 976, 977, 1008, 17, 17, 18, 49, 50, 81, 82, 113, 114, 145,
+ 146, 177, 178, 209, 210, 241, 242, 273, 274, 305, 306, 337, 338, 369,
+ 370, 401, 402, 433, 434, 465, 466, 497, 498, 529, 530, 561, 562, 593,
+ 594, 625, 626, 657, 658, 689, 690, 721, 722, 753, 754, 785, 786, 817,
+ 818, 849, 850, 881, 882, 913, 914, 945, 946, 977, 978, 1009, 18, 18,
+ 19, 50, 51, 82, 83, 114, 115, 146, 147, 178, 179, 210, 211, 242,
+ 243, 274, 275, 306, 307, 338, 339, 370, 371, 402, 403, 434, 435, 466,
+ 467, 498, 499, 530, 531, 562, 563, 594, 595, 626, 627, 658, 659, 690,
+ 691, 722, 723, 754, 755, 786, 787, 818, 819, 850, 851, 882, 883, 914,
+ 915, 946, 947, 978, 979, 1010, 19, 19, 20, 51, 52, 83, 84, 115,
+ 116, 147, 148, 179, 180, 211, 212, 243, 244, 275, 276, 307, 308, 339,
+ 340, 371, 372, 403, 404, 435, 436, 467, 468, 499, 500, 531, 532, 563,
+ 564, 595, 596, 627, 628, 659, 660, 691, 692, 723, 724, 755, 756, 787,
+ 788, 819, 820, 851, 852, 883, 884, 915, 916, 947, 948, 979, 980, 1011,
+ 20, 20, 21, 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212,
+ 213, 244, 245, 276, 277, 308, 309, 340, 341, 372, 373, 404, 405, 436,
+ 437, 468, 469, 500, 501, 532, 533, 564, 565, 596, 597, 628, 629, 660,
+ 661, 692, 693, 724, 725, 756, 757, 788, 789, 820, 821, 852, 853, 884,
+ 885, 916, 917, 948, 949, 980, 981, 1012, 21, 21, 22, 53, 54, 85,
+ 86, 117, 118, 149, 150, 181, 182, 213, 214, 245, 246, 277, 278, 309,
+ 310, 341, 342, 373, 374, 405, 406, 437, 438, 469, 470, 501, 502, 533,
+ 534, 565, 566, 597, 598, 629, 630, 661, 662, 693, 694, 725, 726, 757,
+ 758, 789, 790, 821, 822, 853, 854, 885, 886, 917, 918, 949, 950, 981,
+ 982, 1013, 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182,
+ 183, 214, 215, 246, 247, 278, 279, 310, 311, 342, 343, 374, 375, 406,
+ 407, 438, 439, 470, 471, 502, 503, 534, 535, 566, 567, 598, 599, 630,
+ 631, 662, 663, 694, 695, 726, 727, 758, 759, 790, 791, 822, 823, 854,
+ 855, 886, 887, 918, 919, 950, 951, 982, 983, 1014, 23, 23, 24, 55,
+ 56, 87, 88, 119, 120, 151, 152, 183, 184, 215, 216, 247, 248, 279,
+ 280, 311, 312, 343, 344, 375, 376, 407, 408, 439, 440, 471, 472, 503,
+ 504, 535, 536, 567, 568, 599, 600, 631, 632, 663, 664, 695, 696, 727,
+ 728, 759, 760, 791, 792, 823, 824, 855, 856, 887, 888, 919, 920, 951,
+ 952, 983, 984, 1015, 24, 24, 25, 56, 57, 88, 89, 120, 121, 152,
+ 153, 184, 185, 216, 217, 248, 249, 280, 281, 312, 313, 344, 345, 376,
+ 377, 408, 409, 440, 441, 472, 473, 504, 505, 536, 537, 568, 569, 600,
+ 601, 632, 633, 664, 665, 696, 697, 728, 729, 760, 761, 792, 793, 824,
+ 825, 856, 857, 888, 889, 920, 921, 952, 953, 984, 985, 1016, 25, 25,
+ 26, 57, 58, 89, 90, 121, 122, 153, 154, 185, 186, 217, 218, 249,
+ 250, 281, 282, 313, 314, 345, 346, 377, 378, 409, 410, 441, 442, 473,
+ 474, 505, 506, 537, 538, 569, 570, 601, 602, 633, 634, 665, 666, 697,
+ 698, 729, 730, 761, 762, 793, 794, 825, 826, 857, 858, 889, 890, 921,
+ 922, 953, 954, 985, 986, 1017, 26, 26, 27, 58, 59, 90, 91, 122,
+ 123, 154, 155, 186, 187, 218, 219, 250, 251, 282, 283, 314, 315, 346,
+ 347, 378, 379, 410, 411, 442, 443, 474, 475, 506, 507, 538, 539, 570,
+ 571, 602, 603, 634, 635, 666, 667, 698, 699, 730, 731, 762, 763, 794,
+ 795, 826, 827, 858, 859, 890, 891, 922, 923, 954, 955, 986, 987, 1018,
+ 27, 27, 28, 59, 60, 91, 92, 123, 124, 155, 156, 187, 188, 219,
+ 220, 251, 252, 283, 284, 315, 316, 347, 348, 379, 380, 411, 412, 443,
+ 444, 475, 476, 507, 508, 539, 540, 571, 572, 603, 604, 635, 636, 667,
+ 668, 699, 700, 731, 732, 763, 764, 795, 796, 827, 828, 859, 860, 891,
+ 892, 923, 924, 955, 956, 987, 988, 1019, 28, 28, 29, 60, 61, 92,
+ 93, 124, 125, 156, 157, 188, 189, 220, 221, 252, 253, 284, 285, 316,
+ 317, 348, 349, 380, 381, 412, 413, 444, 445, 476, 477, 508, 509, 540,
+ 541, 572, 573, 604, 605, 636, 637, 668, 669, 700, 701, 732, 733, 764,
+ 765, 796, 797, 828, 829, 860, 861, 892, 893, 924, 925, 956, 957, 988,
+ 989, 1020, 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189,
+ 190, 221, 222, 253, 254, 285, 286, 317, 318, 349, 350, 381, 382, 413,
+ 414, 445, 446, 477, 478, 509, 510, 541, 542, 573, 574, 605, 606, 637,
+ 638, 669, 670, 701, 702, 733, 734, 765, 766, 797, 798, 829, 830, 861,
+ 862, 893, 894, 925, 926, 957, 958, 989, 990, 1021, 30, 30, 31, 62,
+ 63, 94, 95, 126, 127, 158, 159, 190, 191, 222, 223, 254, 255, 286,
+ 287, 318, 319, 350, 351, 382, 383, 414, 415, 446, 447, 478, 479, 510,
+ 511, 542, 543, 574, 575, 606, 607, 638, 639, 670, 671, 702, 703, 734,
+ 735, 766, 767, 798, 799, 830, 831, 862, 863, 894, 895, 926, 927, 958,
+ 959, 990, 991, 1022, 0, 0,
+};
+
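+/* Each *_scan_32x32_neighbors table pairs every one of the 1024 scan
+ * positions (plus a terminating 0, 0 entry) with its MAX_NEIGHBORS
+ * previously scanned coefficient positions.  The entropy coder derives the
+ * coding context for a coefficient from those two neighbors, roughly as
+ *   ctx = (1 + token_cache[nb[2 * i + 0]] + token_cache[nb[2 * i + 1]]) >> 1;
+ * (a sketch of the lookup only, not the exact helper code). */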
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
+ 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
+ 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19,
+ 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26,
+ 27, 27, 28, 28, 29, 29, 30, 30, 0, 0, 1, 32, 2, 33,
+ 3, 34, 4, 35, 5, 36, 6, 37, 7, 38, 8, 39, 9, 40,
+ 10, 41, 11, 42, 12, 43, 13, 44, 14, 45, 15, 46, 16, 47,
+ 17, 48, 18, 49, 19, 50, 20, 51, 21, 52, 22, 53, 23, 54,
+ 24, 55, 25, 56, 26, 57, 27, 58, 28, 59, 29, 60, 30, 61,
+ 31, 62, 32, 32, 33, 64, 34, 65, 35, 66, 36, 67, 37, 68,
+ 38, 69, 39, 70, 40, 71, 41, 72, 42, 73, 43, 74, 44, 75,
+ 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50, 81, 51, 82,
+ 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88, 58, 89,
+ 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65, 96,
+ 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103,
+ 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110,
+ 80, 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117,
+ 87, 118, 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124,
+ 94, 125, 95, 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131,
+ 101, 132, 102, 133, 103, 134, 104, 135, 105, 136, 106, 137, 107, 138,
+ 108, 139, 109, 140, 110, 141, 111, 142, 112, 143, 113, 144, 114, 145,
+ 115, 146, 116, 147, 117, 148, 118, 149, 119, 150, 120, 151, 121, 152,
+ 122, 153, 123, 154, 124, 155, 125, 156, 126, 157, 127, 158, 128, 128,
+ 129, 160, 130, 161, 131, 162, 132, 163, 133, 164, 134, 165, 135, 166,
+ 136, 167, 137, 168, 138, 169, 139, 170, 140, 171, 141, 172, 142, 173,
+ 143, 174, 144, 175, 145, 176, 146, 177, 147, 178, 148, 179, 149, 180,
+ 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155, 186, 156, 187,
+ 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193, 163, 194,
+ 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170, 201,
+ 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208,
+ 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215,
+ 185, 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222,
+ 192, 192, 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229,
+ 199, 230, 200, 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236,
+ 206, 237, 207, 238, 208, 239, 209, 240, 210, 241, 211, 242, 212, 243,
+ 213, 244, 214, 245, 215, 246, 216, 247, 217, 248, 218, 249, 219, 250,
+ 220, 251, 221, 252, 222, 253, 223, 254, 224, 224, 225, 256, 226, 257,
+ 227, 258, 228, 259, 229, 260, 230, 261, 231, 262, 232, 263, 233, 264,
+ 234, 265, 235, 266, 236, 267, 237, 268, 238, 269, 239, 270, 240, 271,
+ 241, 272, 242, 273, 243, 274, 244, 275, 245, 276, 246, 277, 247, 278,
+ 248, 279, 249, 280, 250, 281, 251, 282, 252, 283, 253, 284, 254, 285,
+ 255, 286, 256, 256, 257, 288, 258, 289, 259, 290, 260, 291, 261, 292,
+ 262, 293, 263, 294, 264, 295, 265, 296, 266, 297, 267, 298, 268, 299,
+ 269, 300, 270, 301, 271, 302, 272, 303, 273, 304, 274, 305, 275, 306,
+ 276, 307, 277, 308, 278, 309, 279, 310, 280, 311, 281, 312, 282, 313,
+ 283, 314, 284, 315, 285, 316, 286, 317, 287, 318, 288, 288, 289, 320,
+ 290, 321, 291, 322, 292, 323, 293, 324, 294, 325, 295, 326, 296, 327,
+ 297, 328, 298, 329, 299, 330, 300, 331, 301, 332, 302, 333, 303, 334,
+ 304, 335, 305, 336, 306, 337, 307, 338, 308, 339, 309, 340, 310, 341,
+ 311, 342, 312, 343, 313, 344, 314, 345, 315, 346, 316, 347, 317, 348,
+ 318, 349, 319, 350, 320, 320, 321, 352, 322, 353, 323, 354, 324, 355,
+ 325, 356, 326, 357, 327, 358, 328, 359, 329, 360, 330, 361, 331, 362,
+ 332, 363, 333, 364, 334, 365, 335, 366, 336, 367, 337, 368, 338, 369,
+ 339, 370, 340, 371, 341, 372, 342, 373, 343, 374, 344, 375, 345, 376,
+ 346, 377, 347, 378, 348, 379, 349, 380, 350, 381, 351, 382, 352, 352,
+ 353, 384, 354, 385, 355, 386, 356, 387, 357, 388, 358, 389, 359, 390,
+ 360, 391, 361, 392, 362, 393, 363, 394, 364, 395, 365, 396, 366, 397,
+ 367, 398, 368, 399, 369, 400, 370, 401, 371, 402, 372, 403, 373, 404,
+ 374, 405, 375, 406, 376, 407, 377, 408, 378, 409, 379, 410, 380, 411,
+ 381, 412, 382, 413, 383, 414, 384, 384, 385, 416, 386, 417, 387, 418,
+ 388, 419, 389, 420, 390, 421, 391, 422, 392, 423, 393, 424, 394, 425,
+ 395, 426, 396, 427, 397, 428, 398, 429, 399, 430, 400, 431, 401, 432,
+ 402, 433, 403, 434, 404, 435, 405, 436, 406, 437, 407, 438, 408, 439,
+ 409, 440, 410, 441, 411, 442, 412, 443, 413, 444, 414, 445, 415, 446,
+ 416, 416, 417, 448, 418, 449, 419, 450, 420, 451, 421, 452, 422, 453,
+ 423, 454, 424, 455, 425, 456, 426, 457, 427, 458, 428, 459, 429, 460,
+ 430, 461, 431, 462, 432, 463, 433, 464, 434, 465, 435, 466, 436, 467,
+ 437, 468, 438, 469, 439, 470, 440, 471, 441, 472, 442, 473, 443, 474,
+ 444, 475, 445, 476, 446, 477, 447, 478, 448, 448, 449, 480, 450, 481,
+ 451, 482, 452, 483, 453, 484, 454, 485, 455, 486, 456, 487, 457, 488,
+ 458, 489, 459, 490, 460, 491, 461, 492, 462, 493, 463, 494, 464, 495,
+ 465, 496, 466, 497, 467, 498, 468, 499, 469, 500, 470, 501, 471, 502,
+ 472, 503, 473, 504, 474, 505, 475, 506, 476, 507, 477, 508, 478, 509,
+ 479, 510, 480, 480, 481, 512, 482, 513, 483, 514, 484, 515, 485, 516,
+ 486, 517, 487, 518, 488, 519, 489, 520, 490, 521, 491, 522, 492, 523,
+ 493, 524, 494, 525, 495, 526, 496, 527, 497, 528, 498, 529, 499, 530,
+ 500, 531, 501, 532, 502, 533, 503, 534, 504, 535, 505, 536, 506, 537,
+ 507, 538, 508, 539, 509, 540, 510, 541, 511, 542, 512, 512, 513, 544,
+ 514, 545, 515, 546, 516, 547, 517, 548, 518, 549, 519, 550, 520, 551,
+ 521, 552, 522, 553, 523, 554, 524, 555, 525, 556, 526, 557, 527, 558,
+ 528, 559, 529, 560, 530, 561, 531, 562, 532, 563, 533, 564, 534, 565,
+ 535, 566, 536, 567, 537, 568, 538, 569, 539, 570, 540, 571, 541, 572,
+ 542, 573, 543, 574, 544, 544, 545, 576, 546, 577, 547, 578, 548, 579,
+ 549, 580, 550, 581, 551, 582, 552, 583, 553, 584, 554, 585, 555, 586,
+ 556, 587, 557, 588, 558, 589, 559, 590, 560, 591, 561, 592, 562, 593,
+ 563, 594, 564, 595, 565, 596, 566, 597, 567, 598, 568, 599, 569, 600,
+ 570, 601, 571, 602, 572, 603, 573, 604, 574, 605, 575, 606, 576, 576,
+ 577, 608, 578, 609, 579, 610, 580, 611, 581, 612, 582, 613, 583, 614,
+ 584, 615, 585, 616, 586, 617, 587, 618, 588, 619, 589, 620, 590, 621,
+ 591, 622, 592, 623, 593, 624, 594, 625, 595, 626, 596, 627, 597, 628,
+ 598, 629, 599, 630, 600, 631, 601, 632, 602, 633, 603, 634, 604, 635,
+ 605, 636, 606, 637, 607, 638, 608, 608, 609, 640, 610, 641, 611, 642,
+ 612, 643, 613, 644, 614, 645, 615, 646, 616, 647, 617, 648, 618, 649,
+ 619, 650, 620, 651, 621, 652, 622, 653, 623, 654, 624, 655, 625, 656,
+ 626, 657, 627, 658, 628, 659, 629, 660, 630, 661, 631, 662, 632, 663,
+ 633, 664, 634, 665, 635, 666, 636, 667, 637, 668, 638, 669, 639, 670,
+ 640, 640, 641, 672, 642, 673, 643, 674, 644, 675, 645, 676, 646, 677,
+ 647, 678, 648, 679, 649, 680, 650, 681, 651, 682, 652, 683, 653, 684,
+ 654, 685, 655, 686, 656, 687, 657, 688, 658, 689, 659, 690, 660, 691,
+ 661, 692, 662, 693, 663, 694, 664, 695, 665, 696, 666, 697, 667, 698,
+ 668, 699, 669, 700, 670, 701, 671, 702, 672, 672, 673, 704, 674, 705,
+ 675, 706, 676, 707, 677, 708, 678, 709, 679, 710, 680, 711, 681, 712,
+ 682, 713, 683, 714, 684, 715, 685, 716, 686, 717, 687, 718, 688, 719,
+ 689, 720, 690, 721, 691, 722, 692, 723, 693, 724, 694, 725, 695, 726,
+ 696, 727, 697, 728, 698, 729, 699, 730, 700, 731, 701, 732, 702, 733,
+ 703, 734, 704, 704, 705, 736, 706, 737, 707, 738, 708, 739, 709, 740,
+ 710, 741, 711, 742, 712, 743, 713, 744, 714, 745, 715, 746, 716, 747,
+ 717, 748, 718, 749, 719, 750, 720, 751, 721, 752, 722, 753, 723, 754,
+ 724, 755, 725, 756, 726, 757, 727, 758, 728, 759, 729, 760, 730, 761,
+ 731, 762, 732, 763, 733, 764, 734, 765, 735, 766, 736, 736, 737, 768,
+ 738, 769, 739, 770, 740, 771, 741, 772, 742, 773, 743, 774, 744, 775,
+ 745, 776, 746, 777, 747, 778, 748, 779, 749, 780, 750, 781, 751, 782,
+ 752, 783, 753, 784, 754, 785, 755, 786, 756, 787, 757, 788, 758, 789,
+ 759, 790, 760, 791, 761, 792, 762, 793, 763, 794, 764, 795, 765, 796,
+ 766, 797, 767, 798, 768, 768, 769, 800, 770, 801, 771, 802, 772, 803,
+ 773, 804, 774, 805, 775, 806, 776, 807, 777, 808, 778, 809, 779, 810,
+ 780, 811, 781, 812, 782, 813, 783, 814, 784, 815, 785, 816, 786, 817,
+ 787, 818, 788, 819, 789, 820, 790, 821, 791, 822, 792, 823, 793, 824,
+ 794, 825, 795, 826, 796, 827, 797, 828, 798, 829, 799, 830, 800, 800,
+ 801, 832, 802, 833, 803, 834, 804, 835, 805, 836, 806, 837, 807, 838,
+ 808, 839, 809, 840, 810, 841, 811, 842, 812, 843, 813, 844, 814, 845,
+ 815, 846, 816, 847, 817, 848, 818, 849, 819, 850, 820, 851, 821, 852,
+ 822, 853, 823, 854, 824, 855, 825, 856, 826, 857, 827, 858, 828, 859,
+ 829, 860, 830, 861, 831, 862, 832, 832, 833, 864, 834, 865, 835, 866,
+ 836, 867, 837, 868, 838, 869, 839, 870, 840, 871, 841, 872, 842, 873,
+ 843, 874, 844, 875, 845, 876, 846, 877, 847, 878, 848, 879, 849, 880,
+ 850, 881, 851, 882, 852, 883, 853, 884, 854, 885, 855, 886, 856, 887,
+ 857, 888, 858, 889, 859, 890, 860, 891, 861, 892, 862, 893, 863, 894,
+ 864, 864, 865, 896, 866, 897, 867, 898, 868, 899, 869, 900, 870, 901,
+ 871, 902, 872, 903, 873, 904, 874, 905, 875, 906, 876, 907, 877, 908,
+ 878, 909, 879, 910, 880, 911, 881, 912, 882, 913, 883, 914, 884, 915,
+ 885, 916, 886, 917, 887, 918, 888, 919, 889, 920, 890, 921, 891, 922,
+ 892, 923, 893, 924, 894, 925, 895, 926, 896, 896, 897, 928, 898, 929,
+ 899, 930, 900, 931, 901, 932, 902, 933, 903, 934, 904, 935, 905, 936,
+ 906, 937, 907, 938, 908, 939, 909, 940, 910, 941, 911, 942, 912, 943,
+ 913, 944, 914, 945, 915, 946, 916, 947, 917, 948, 918, 949, 919, 950,
+ 920, 951, 921, 952, 922, 953, 923, 954, 924, 955, 925, 956, 926, 957,
+ 927, 958, 928, 928, 929, 960, 930, 961, 931, 962, 932, 963, 933, 964,
+ 934, 965, 935, 966, 936, 967, 937, 968, 938, 969, 939, 970, 940, 971,
+ 941, 972, 942, 973, 943, 974, 944, 975, 945, 976, 946, 977, 947, 978,
+ 948, 979, 949, 980, 950, 981, 951, 982, 952, 983, 953, 984, 954, 985,
+ 955, 986, 956, 987, 957, 988, 958, 989, 959, 990, 960, 960, 961, 992,
+ 962, 993, 963, 994, 964, 995, 965, 996, 966, 997, 967, 998, 968, 999,
+ 969, 1000, 970, 1001, 971, 1002, 972, 1003, 973, 1004, 974, 1005, 975, 1006,
+ 976, 1007, 977, 1008, 978, 1009, 979, 1010, 980, 1011, 981, 1012, 982, 1013,
+ 983, 1014, 984, 1015, 985, 1016, 986, 1017, 987, 1018, 988, 1019, 989, 1020,
+ 990, 1021, 991, 1022, 0, 0,
+};
+#endif // CONFIG_EXT_TX
+
DECLARE_ALIGNED(16, static const int16_t,
default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
- 0, 0, 0, 0, 0, 0, 32, 32, 1, 32, 1, 1, 64, 64,
- 33, 64, 2, 33, 96, 96, 2, 2, 65, 96, 34, 65, 128, 128,
- 97, 128, 3, 34, 66, 97, 3, 3, 35, 66, 98, 129, 129, 160,
- 160, 160, 4, 35, 67, 98, 192, 192, 4, 4, 130, 161, 161, 192,
- 36, 67, 99, 130, 5, 36, 68, 99, 193, 224, 162, 193, 224, 224,
- 131, 162, 37, 68, 100, 131, 5, 5, 194, 225, 225, 256, 256, 256,
- 163, 194, 69, 100, 132, 163, 6, 37, 226, 257, 6, 6, 195, 226,
- 257, 288, 101, 132, 288, 288, 38, 69, 164, 195, 133, 164, 258, 289,
- 227, 258, 196, 227, 7, 38, 289, 320, 70, 101, 320, 320, 7, 7,
+ 0, 0, 0, 0, 32, 0, 32, 32, 1, 32, 33, 1, 64, 64,
+ 33, 64, 2, 33, 96, 96, 34, 2, 65, 96, 34, 65, 128, 128,
+ 97, 128, 3, 34, 66, 97, 35, 3, 35, 66, 98, 129, 129, 160,
+ 160, 161, 4, 35, 67, 98, 192, 192, 36, 4, 130, 161, 161, 192,
+ 36, 67, 99, 130, 5, 36, 68, 99, 193, 224, 162, 193, 224, 225,
+ 131, 162, 37, 68, 100, 131, 37, 5, 194, 225, 225, 256, 256, 257,
+ 163, 194, 69, 100, 132, 163, 6, 37, 226, 257, 38, 6, 195, 226,
+ 257, 288, 101, 132, 288, 289, 38, 69, 164, 195, 133, 164, 258, 289,
+ 227, 258, 196, 227, 7, 38, 289, 320, 70, 101, 320, 321, 39, 7,
165, 196, 39, 70, 102, 133, 290, 321, 259, 290, 228, 259, 321, 352,
- 352, 352, 197, 228, 134, 165, 71, 102, 8, 39, 322, 353, 291, 322,
- 260, 291, 103, 134, 353, 384, 166, 197, 229, 260, 40, 71, 8, 8,
- 384, 384, 135, 166, 354, 385, 323, 354, 198, 229, 292, 323, 72, 103,
+ 352, 353, 197, 228, 134, 165, 71, 102, 8, 39, 322, 353, 291, 322,
+ 260, 291, 103, 134, 353, 384, 166, 197, 229, 260, 40, 71, 40, 8,
+ 384, 385, 135, 166, 354, 385, 323, 354, 198, 229, 292, 323, 72, 103,
261, 292, 9, 40, 385, 416, 167, 198, 104, 135, 230, 261, 355, 386,
- 416, 416, 293, 324, 324, 355, 9, 9, 41, 72, 386, 417, 199, 230,
+ 416, 417, 293, 324, 324, 355, 41, 9, 41, 72, 386, 417, 199, 230,
136, 167, 417, 448, 262, 293, 356, 387, 73, 104, 387, 418, 231, 262,
- 10, 41, 168, 199, 325, 356, 418, 449, 105, 136, 448, 448, 42, 73,
- 294, 325, 200, 231, 10, 10, 357, 388, 137, 168, 263, 294, 388, 419,
+ 10, 41, 168, 199, 325, 356, 418, 449, 105, 136, 448, 449, 42, 73,
+ 294, 325, 200, 231, 42, 10, 357, 388, 137, 168, 263, 294, 388, 419,
74, 105, 419, 450, 449, 480, 326, 357, 232, 263, 295, 326, 169, 200,
- 11, 42, 106, 137, 480, 480, 450, 481, 358, 389, 264, 295, 201, 232,
- 138, 169, 389, 420, 43, 74, 420, 451, 327, 358, 11, 11, 481, 512,
- 233, 264, 451, 482, 296, 327, 75, 106, 170, 201, 482, 513, 512, 512,
+ 11, 42, 106, 137, 480, 481, 450, 481, 358, 389, 264, 295, 201, 232,
+ 138, 169, 389, 420, 43, 74, 420, 451, 327, 358, 43, 11, 481, 512,
+ 233, 264, 451, 482, 296, 327, 75, 106, 170, 201, 482, 513, 512, 513,
390, 421, 359, 390, 421, 452, 107, 138, 12, 43, 202, 233, 452, 483,
265, 296, 328, 359, 139, 170, 44, 75, 483, 514, 513, 544, 234, 265,
- 297, 328, 422, 453, 12, 12, 391, 422, 171, 202, 76, 107, 514, 545,
- 453, 484, 544, 544, 266, 297, 203, 234, 108, 139, 329, 360, 298, 329,
+ 297, 328, 422, 453, 44, 12, 391, 422, 171, 202, 76, 107, 514, 545,
+ 453, 484, 544, 545, 266, 297, 203, 234, 108, 139, 329, 360, 298, 329,
140, 171, 515, 546, 13, 44, 423, 454, 235, 266, 545, 576, 454, 485,
- 45, 76, 172, 203, 330, 361, 576, 576, 13, 13, 267, 298, 546, 577,
+ 45, 76, 172, 203, 330, 361, 576, 577, 45, 13, 267, 298, 546, 577,
77, 108, 204, 235, 455, 486, 577, 608, 299, 330, 109, 140, 547, 578,
- 14, 45, 14, 14, 141, 172, 578, 609, 331, 362, 46, 77, 173, 204,
+ 14, 45, 46, 14, 141, 172, 578, 609, 331, 362, 46, 77, 173, 204,
15, 15, 78, 109, 205, 236, 579, 610, 110, 141, 15, 46, 142, 173,
47, 78, 174, 205, 16, 16, 79, 110, 206, 237, 16, 47, 111, 142,
- 48, 79, 143, 174, 80, 111, 175, 206, 17, 48, 17, 17, 207, 238,
+ 48, 79, 143, 174, 80, 111, 175, 206, 17, 48, 49, 17, 207, 238,
49, 80, 81, 112, 18, 18, 18, 49, 50, 81, 82, 113, 19, 50,
- 51, 82, 83, 114, 608, 608, 484, 515, 360, 391, 236, 267, 112, 143,
- 19, 19, 640, 640, 609, 640, 516, 547, 485, 516, 392, 423, 361, 392,
- 268, 299, 237, 268, 144, 175, 113, 144, 20, 51, 20, 20, 672, 672,
+ 51, 82, 83, 114, 608, 609, 484, 515, 360, 391, 236, 267, 112, 143,
+ 51, 19, 640, 640, 609, 640, 516, 547, 485, 516, 392, 423, 361, 392,
+ 268, 299, 237, 268, 144, 175, 113, 144, 20, 51, 52, 20, 672, 672,
641, 672, 610, 641, 548, 579, 517, 548, 486, 517, 424, 455, 393, 424,
362, 393, 300, 331, 269, 300, 238, 269, 176, 207, 145, 176, 114, 145,
- 52, 83, 21, 52, 21, 21, 704, 704, 673, 704, 642, 673, 611, 642,
+ 52, 83, 21, 52, 53, 21, 704, 704, 673, 704, 642, 673, 611, 642,
580, 611, 549, 580, 518, 549, 487, 518, 456, 487, 425, 456, 394, 425,
363, 394, 332, 363, 301, 332, 270, 301, 239, 270, 208, 239, 177, 208,
- 146, 177, 115, 146, 84, 115, 53, 84, 22, 53, 22, 22, 705, 736,
+ 146, 177, 115, 146, 84, 115, 53, 84, 22, 53, 54, 22, 705, 736,
674, 705, 643, 674, 581, 612, 550, 581, 519, 550, 457, 488, 426, 457,
395, 426, 333, 364, 302, 333, 271, 302, 209, 240, 178, 209, 147, 178,
85, 116, 54, 85, 23, 54, 706, 737, 675, 706, 582, 613, 551, 582,
458, 489, 427, 458, 334, 365, 303, 334, 210, 241, 179, 210, 86, 117,
55, 86, 707, 738, 583, 614, 459, 490, 335, 366, 211, 242, 87, 118,
- 736, 736, 612, 643, 488, 519, 364, 395, 240, 271, 116, 147, 23, 23,
+ 736, 737, 612, 643, 488, 519, 364, 395, 240, 271, 116, 147, 55, 23,
768, 768, 737, 768, 644, 675, 613, 644, 520, 551, 489, 520, 396, 427,
- 365, 396, 272, 303, 241, 272, 148, 179, 117, 148, 24, 55, 24, 24,
+ 365, 396, 272, 303, 241, 272, 148, 179, 117, 148, 24, 55, 56, 24,
800, 800, 769, 800, 738, 769, 676, 707, 645, 676, 614, 645, 552, 583,
521, 552, 490, 521, 428, 459, 397, 428, 366, 397, 304, 335, 273, 304,
- 242, 273, 180, 211, 149, 180, 118, 149, 56, 87, 25, 56, 25, 25,
+ 242, 273, 180, 211, 149, 180, 118, 149, 56, 87, 25, 56, 57, 25,
832, 832, 801, 832, 770, 801, 739, 770, 708, 739, 677, 708, 646, 677,
615, 646, 584, 615, 553, 584, 522, 553, 491, 522, 460, 491, 429, 460,
398, 429, 367, 398, 336, 367, 305, 336, 274, 305, 243, 274, 212, 243,
- 181, 212, 150, 181, 119, 150, 88, 119, 57, 88, 26, 57, 26, 26,
+ 181, 212, 150, 181, 119, 150, 88, 119, 57, 88, 26, 57, 58, 26,
833, 864, 802, 833, 771, 802, 709, 740, 678, 709, 647, 678, 585, 616,
554, 585, 523, 554, 461, 492, 430, 461, 399, 430, 337, 368, 306, 337,
275, 306, 213, 244, 182, 213, 151, 182, 89, 120, 58, 89, 27, 58,
834, 865, 803, 834, 710, 741, 679, 710, 586, 617, 555, 586, 462, 493,
431, 462, 338, 369, 307, 338, 214, 245, 183, 214, 90, 121, 59, 90,
835, 866, 711, 742, 587, 618, 463, 494, 339, 370, 215, 246, 91, 122,
- 864, 864, 740, 771, 616, 647, 492, 523, 368, 399, 244, 275, 120, 151,
- 27, 27, 896, 896, 865, 896, 772, 803, 741, 772, 648, 679, 617, 648,
+ 864, 865, 740, 771, 616, 647, 492, 523, 368, 399, 244, 275, 120, 151,
+ 59, 27, 896, 896, 865, 896, 772, 803, 741, 772, 648, 679, 617, 648,
524, 555, 493, 524, 400, 431, 369, 400, 276, 307, 245, 276, 152, 183,
- 121, 152, 28, 59, 28, 28, 928, 928, 897, 928, 866, 897, 804, 835,
+ 121, 152, 28, 59, 60, 28, 928, 928, 897, 928, 866, 897, 804, 835,
773, 804, 742, 773, 680, 711, 649, 680, 618, 649, 556, 587, 525, 556,
494, 525, 432, 463, 401, 432, 370, 401, 308, 339, 277, 308, 246, 277,
- 184, 215, 153, 184, 122, 153, 60, 91, 29, 60, 29, 29, 960, 960,
+ 184, 215, 153, 184, 122, 153, 60, 91, 29, 60, 61, 29, 960, 960,
929, 960, 898, 929, 867, 898, 836, 867, 805, 836, 774, 805, 743, 774,
712, 743, 681, 712, 650, 681, 619, 650, 588, 619, 557, 588, 526, 557,
495, 526, 464, 495, 433, 464, 402, 433, 371, 402, 340, 371, 309, 340,
278, 309, 247, 278, 216, 247, 185, 216, 154, 185, 123, 154, 92, 123,
- 61, 92, 30, 61, 30, 30, 961, 992, 930, 961, 899, 930, 837, 868,
+ 61, 92, 30, 61, 62, 30, 961, 992, 930, 961, 899, 930, 837, 868,
806, 837, 775, 806, 713, 744, 682, 713, 651, 682, 589, 620, 558, 589,
527, 558, 465, 496, 434, 465, 403, 434, 341, 372, 310, 341, 279, 310,
217, 248, 186, 217, 155, 186, 93, 124, 62, 93, 31, 62, 962, 993,
@@ -515,10 +2372,475 @@
959, 990, 991, 1022, 0, 0,
};
+#if CONFIG_EXT_TX
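+/* The v2/h2/qtr neighbor tables below belong to the EXT_TX scan orders,
+ * which appear to visit the top half (v2), left half (h2), or top-left
+ * quarter (qtr) of the 32x32 block before the remaining positions; each
+ * table uses the same per-position neighbor-pair layout as above. */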
+DECLARE_ALIGNED(16, static const int16_t,
+ v2_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 32, 1, 1, 32, 32, 2, 33,
+ 33, 64, 34, 65, 2, 2, 64, 64, 3, 34, 65, 96, 35, 66,
+ 66, 97, 3, 3, 96, 96, 4, 35, 97, 128, 67, 98, 36, 67,
+ 98, 129, 4, 4, 68, 99, 99, 130, 128, 128, 5, 36, 129, 160,
+ 37, 68, 130, 161, 100, 131, 69, 100, 131, 162, 5, 5, 160, 160,
+ 6, 37, 161, 192, 38, 69, 162, 193, 101, 132, 132, 163, 70, 101,
+ 163, 194, 6, 6, 192, 192, 7, 38, 133, 164, 193, 224, 102, 133,
+ 164, 195, 39, 70, 194, 225, 71, 102, 195, 226, 134, 165, 165, 196,
+ 7, 7, 224, 224, 8, 39, 103, 134, 196, 227, 225, 256, 40, 71,
+ 226, 257, 166, 197, 72, 103, 227, 258, 135, 166, 197, 228, 104, 135,
+ 228, 259, 8, 8, 256, 256, 9, 40, 257, 288, 41, 72, 167, 198,
+ 198, 229, 258, 289, 136, 167, 229, 260, 73, 104, 259, 290, 105, 136,
+ 260, 291, 199, 230, 9, 9, 168, 199, 230, 261, 288, 288, 10, 41,
+ 289, 320, 42, 73, 290, 321, 137, 168, 261, 292, 74, 105, 291, 322,
+ 200, 231, 231, 262, 106, 137, 292, 323, 169, 200, 262, 293, 10, 10,
+ 320, 320, 11, 42, 321, 352, 43, 74, 138, 169, 293, 324, 322, 353,
+ 232, 263, 75, 106, 201, 232, 263, 294, 323, 354, 170, 201, 294, 325,
+ 107, 138, 324, 355, 11, 11, 352, 352, 12, 43, 233, 264, 264, 295,
+ 353, 384, 139, 170, 325, 356, 44, 75, 354, 385, 202, 233, 295, 326,
+ 76, 107, 355, 386, 171, 202, 326, 357, 108, 139, 356, 387, 265, 296,
+ 234, 265, 296, 327, 12, 12, 140, 171, 357, 388, 384, 384, 13, 44,
+ 203, 234, 327, 358, 385, 416, 45, 76, 386, 417, 77, 108, 387, 418,
+ 172, 203, 358, 389, 266, 297, 297, 328, 109, 140, 235, 266, 328, 359,
+ 388, 419, 204, 235, 359, 390, 141, 172, 389, 420, 13, 13, 416, 416,
+ 14, 45, 417, 448, 46, 77, 298, 329, 418, 449, 267, 298, 329, 360,
+ 78, 109, 173, 204, 390, 421, 419, 450, 236, 267, 360, 391, 110, 141,
+ 420, 451, 205, 236, 391, 422, 142, 173, 299, 330, 330, 361, 421, 452,
+ 14, 14, 268, 299, 361, 392, 448, 448, 15, 46, 449, 480, 47, 78,
+ 450, 481, 174, 205, 422, 453, 237, 268, 392, 423, 79, 110, 451, 482,
+ 111, 142, 452, 483, 331, 362, 300, 331, 362, 393, 206, 237, 423, 454,
+ 143, 174, 269, 300, 393, 424, 453, 484, 480, 480, 481, 512, 238, 269,
+ 424, 455, 482, 513, 175, 206, 454, 485, 332, 363, 363, 394, 483, 514,
+ 301, 332, 394, 425, 484, 515, 207, 238, 455, 486, 270, 301, 425, 456,
+ 485, 516, 364, 395, 239, 270, 456, 487, 512, 512, 333, 364, 395, 426,
+ 513, 544, 486, 517, 514, 545, 302, 333, 426, 457, 515, 546, 487, 518,
+ 516, 547, 271, 302, 457, 488, 365, 396, 396, 427, 517, 548, 334, 365,
+ 427, 458, 488, 519, 544, 544, 303, 334, 458, 489, 518, 549, 545, 576,
+ 546, 577, 547, 578, 489, 520, 397, 428, 519, 550, 366, 397, 428, 459,
+ 548, 579, 335, 366, 459, 490, 549, 580, 520, 551, 490, 521, 550, 581,
+ 576, 576, 577, 608, 398, 429, 429, 460, 578, 609, 367, 398, 460, 491,
+ 521, 552, 579, 610, 551, 582, 491, 522, 580, 611, 581, 612, 552, 583,
+ 522, 553, 430, 461, 399, 430, 461, 492, 582, 613, 492, 523, 608, 608,
+ 609, 640, 610, 641, 553, 584, 611, 642, 523, 554, 583, 614, 612, 643,
+ 431, 462, 462, 493, 554, 585, 493, 524, 584, 615, 613, 644, 524, 555,
+ 614, 645, 640, 640, 585, 616, 641, 672, 555, 586, 642, 673, 615, 646,
+ 463, 494, 643, 674, 494, 525, 644, 675, 525, 556, 586, 617, 616, 647,
+ 645, 676, 556, 587, 646, 677, 495, 526, 617, 648, 587, 618, 672, 672,
+ 526, 557, 673, 704, 674, 705, 647, 678, 557, 588, 675, 706, 618, 649,
+ 676, 707, 588, 619, 648, 679, 677, 708, 527, 558, 558, 589, 678, 709,
+ 619, 650, 649, 680, 704, 704, 589, 620, 705, 736, 679, 710, 706, 737,
+ 707, 738, 650, 681, 620, 651, 708, 739, 680, 711, 559, 590, 709, 740,
+ 590, 621, 651, 682, 681, 712, 710, 741, 621, 652, 736, 736, 737, 768,
+ 711, 742, 738, 769, 682, 713, 652, 683, 739, 770, 591, 622, 740, 771,
+ 712, 743, 622, 653, 741, 772, 683, 714, 653, 684, 713, 744, 742, 773,
+ 623, 654, 743, 774, 768, 768, 769, 800, 684, 715, 714, 745, 770, 801,
+ 771, 802, 654, 685, 744, 775, 772, 803, 715, 746, 773, 804, 685, 716,
+ 745, 776, 774, 805, 655, 686, 716, 747, 775, 806, 746, 777, 800, 800,
+ 801, 832, 686, 717, 802, 833, 803, 834, 776, 807, 804, 835, 747, 778,
+ 717, 748, 805, 836, 777, 808, 687, 718, 806, 837, 748, 779, 718, 749,
+ 778, 809, 807, 838, 832, 832, 833, 864, 834, 865, 835, 866, 808, 839,
+ 749, 780, 836, 867, 779, 810, 719, 750, 837, 868, 809, 840, 838, 869,
+ 780, 811, 750, 781, 810, 841, 839, 870, 864, 864, 865, 896, 866, 897,
+ 840, 871, 867, 898, 781, 812, 811, 842, 868, 899, 751, 782, 869, 900,
+ 841, 872, 812, 843, 870, 901, 782, 813, 842, 873, 871, 902, 896, 896,
+ 897, 928, 813, 844, 898, 929, 872, 903, 783, 814, 843, 874, 899, 930,
+ 900, 931, 873, 904, 901, 932, 814, 845, 844, 875, 902, 933, 874, 905,
+ 903, 934, 845, 876, 928, 928, 815, 846, 929, 960, 930, 961, 875, 906,
+ 904, 935, 931, 962, 932, 963, 905, 936, 846, 877, 933, 964, 876, 907,
+ 934, 965, 906, 937, 935, 966, 877, 908, 847, 878, 960, 960, 907, 938,
+ 961, 992, 936, 967, 962, 993, 963, 994, 964, 995, 878, 909, 937, 968,
+ 908, 939, 965, 996, 966, 997, 938, 969, 879, 910, 909, 940, 967, 998,
+ 939, 970, 968, 999, 910, 941, 969, 1000, 940, 971, 970, 1001, 911, 942,
+ 941, 972, 971, 1002, 942, 973, 972, 1003, 943, 974, 973, 1004, 974, 1005,
+ 975, 1006, 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175,
+ 16, 16, 17, 48, 176, 207, 49, 80, 81, 112, 113, 144, 208, 239,
+ 145, 176, 240, 271, 17, 17, 18, 49, 177, 208, 50, 81, 82, 113,
+ 272, 303, 209, 240, 114, 145, 146, 177, 241, 272, 304, 335, 178, 209,
+ 18, 18, 19, 50, 51, 82, 83, 114, 273, 304, 210, 241, 115, 146,
+ 336, 367, 147, 178, 242, 273, 305, 336, 179, 210, 19, 19, 368, 399,
+ 20, 51, 52, 83, 274, 305, 84, 115, 211, 242, 337, 368, 116, 147,
+ 306, 337, 148, 179, 243, 274, 400, 431, 369, 400, 180, 211, 20, 20,
+ 21, 52, 275, 306, 53, 84, 338, 369, 212, 243, 85, 116, 432, 463,
+ 117, 148, 401, 432, 307, 338, 244, 275, 149, 180, 370, 401, 181, 212,
+ 276, 307, 464, 495, 339, 370, 21, 21, 22, 53, 433, 464, 54, 85,
+ 213, 244, 86, 117, 402, 433, 118, 149, 308, 339, 245, 276, 371, 402,
+ 150, 181, 496, 527, 465, 496, 182, 213, 434, 465, 340, 371, 277, 308,
+ 22, 22, 23, 54, 403, 434, 55, 86, 214, 245, 87, 118, 309, 340,
+ 372, 403, 119, 150, 497, 528, 528, 559, 246, 277, 466, 497, 151, 182,
+ 435, 466, 341, 372, 183, 214, 278, 309, 404, 435, 23, 23, 24, 55,
+ 215, 246, 529, 560, 56, 87, 498, 529, 560, 591, 310, 341, 88, 119,
+ 373, 404, 467, 498, 120, 151, 247, 278, 436, 467, 152, 183, 342, 373,
+ 279, 310, 405, 436, 184, 215, 530, 561, 561, 592, 499, 530, 592, 623,
+ 24, 24, 216, 247, 468, 499, 25, 56, 374, 405, 57, 88, 311, 342,
+ 89, 120, 437, 468, 248, 279, 121, 152, 562, 593, 153, 184, 343, 374,
+ 531, 562, 593, 624, 406, 437, 500, 531, 624, 655, 280, 311, 185, 216,
+ 469, 500, 375, 406, 217, 248, 25, 25, 312, 343, 26, 57, 58, 89,
+ 438, 469, 90, 121, 563, 594, 594, 625, 249, 280, 532, 563, 625, 656,
+ 122, 153, 344, 375, 501, 532, 656, 687, 407, 438, 154, 185, 281, 312,
+ 470, 501, 186, 217, 376, 407, 595, 626, 564, 595, 626, 657, 218, 249,
+ 313, 344, 439, 470, 26, 26, 27, 58, 533, 564, 657, 688, 59, 90,
+ 91, 122, 250, 281, 502, 533, 688, 719, 123, 154, 408, 439, 345, 376,
+ 155, 186, 471, 502, 282, 313, 596, 627, 627, 658, 187, 218, 565, 596,
+ 658, 689, 377, 408, 440, 471, 534, 565, 689, 720, 314, 345, 219, 250,
+ 27, 27, 28, 59, 503, 534, 720, 751, 60, 91, 92, 123, 251, 282,
+ 409, 440, 346, 377, 124, 155, 628, 659, 472, 503, 597, 628, 659, 690,
+ 566, 597, 690, 721, 156, 187, 283, 314, 535, 566, 721, 752, 188, 219,
+ 378, 409, 441, 472, 315, 346, 504, 535, 752, 783, 220, 251, 28, 28,
+ 629, 660, 660, 691, 29, 60, 61, 92, 410, 441, 598, 629, 691, 722,
+ 252, 283, 93, 124, 347, 378, 473, 504, 567, 598, 722, 753, 125, 156,
+ 284, 315, 536, 567, 753, 784, 157, 188, 442, 473, 379, 410, 189, 220,
+ 505, 536, 784, 815, 661, 692, 316, 347, 630, 661, 692, 723, 221, 252,
+ 599, 630, 723, 754, 411, 442, 29, 29, 568, 599, 754, 785, 30, 61,
+ 474, 505, 62, 93, 253, 284, 348, 379, 94, 125, 537, 568, 785, 816,
+ 126, 157, 285, 316, 158, 189, 443, 474, 662, 693, 693, 724, 380, 411,
+ 631, 662, 724, 755, 506, 537, 816, 847, 190, 221, 600, 631, 755, 786,
+ 317, 348, 222, 253, 569, 600, 786, 817, 412, 443, 475, 506, 30, 30,
+ 31, 62, 349, 380, 254, 285, 63, 94, 538, 569, 817, 848, 694, 725,
+ 95, 126, 663, 694, 725, 756, 632, 663, 756, 787, 127, 158, 444, 475,
+ 286, 317, 381, 412, 507, 538, 848, 879, 159, 190, 601, 632, 787, 818,
+ 191, 222, 318, 349, 570, 601, 818, 849, 476, 507, 223, 254, 413, 444,
+ 695, 726, 726, 757, 664, 695, 757, 788, 539, 570, 849, 880, 350, 381,
+ 255, 286, 633, 664, 788, 819, 445, 476, 602, 633, 819, 850, 508, 539,
+ 880, 911, 287, 318, 382, 413, 571, 602, 850, 881, 727, 758, 696, 727,
+ 758, 789, 319, 350, 477, 508, 665, 696, 789, 820, 414, 445, 540, 571,
+ 881, 912, 634, 665, 820, 851, 351, 382, 603, 634, 851, 882, 446, 477,
+ 509, 540, 912, 943, 383, 414, 728, 759, 759, 790, 572, 603, 882, 913,
+ 697, 728, 790, 821, 666, 697, 821, 852, 478, 509, 635, 666, 852, 883,
+ 415, 446, 541, 572, 913, 944, 604, 635, 883, 914, 760, 791, 729, 760,
+ 791, 822, 510, 541, 944, 975, 447, 478, 698, 729, 822, 853, 573, 604,
+ 914, 945, 667, 698, 853, 884, 636, 667, 884, 915, 479, 510, 542, 573,
+ 945, 976, 761, 792, 792, 823, 605, 636, 915, 946, 730, 761, 823, 854,
+ 699, 730, 854, 885, 511, 542, 976, 1007, 574, 605, 946, 977, 668, 699,
+ 885, 916, 637, 668, 916, 947, 543, 574, 793, 824, 977, 1008, 762, 793,
+ 824, 855, 731, 762, 855, 886, 606, 637, 947, 978, 700, 731, 886, 917,
+ 669, 700, 917, 948, 575, 606, 978, 1009, 638, 669, 948, 979, 794, 825,
+ 825, 856, 763, 794, 856, 887, 732, 763, 887, 918, 607, 638, 979, 1010,
+ 701, 732, 918, 949, 670, 701, 949, 980, 826, 857, 795, 826, 857, 888,
+ 764, 795, 888, 919, 639, 670, 980, 1011, 733, 764, 919, 950, 702, 733,
+ 950, 981, 671, 702, 981, 1012, 827, 858, 858, 889, 796, 827, 889, 920,
+ 765, 796, 920, 951, 734, 765, 951, 982, 703, 734, 982, 1013, 859, 890,
+ 828, 859, 890, 921, 797, 828, 921, 952, 766, 797, 952, 983, 735, 766,
+ 983, 1014, 860, 891, 891, 922, 829, 860, 922, 953, 798, 829, 953, 984,
+ 767, 798, 984, 1015, 892, 923, 861, 892, 923, 954, 830, 861, 954, 985,
+ 799, 830, 985, 1016, 893, 924, 924, 955, 862, 893, 955, 986, 831, 862,
+ 986, 1017, 925, 956, 894, 925, 956, 987, 863, 894, 987, 1018, 926, 957,
+ 957, 988, 895, 926, 988, 1019, 958, 989, 927, 958, 989, 1020, 959, 990,
+ 990, 1021, 991, 1022, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ h2_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 32, 1, 1, 32, 32, 2, 33,
+ 33, 64, 34, 65, 2, 2, 64, 64, 3, 34, 65, 96, 35, 66,
+ 66, 97, 3, 3, 96, 96, 4, 35, 97, 128, 67, 98, 36, 67,
+ 98, 129, 4, 4, 68, 99, 99, 130, 128, 128, 5, 36, 129, 160,
+ 37, 68, 130, 161, 100, 131, 69, 100, 131, 162, 5, 5, 160, 160,
+ 6, 37, 161, 192, 38, 69, 162, 193, 101, 132, 132, 163, 70, 101,
+ 163, 194, 6, 6, 192, 192, 7, 38, 133, 164, 193, 224, 102, 133,
+ 164, 195, 39, 70, 194, 225, 71, 102, 195, 226, 134, 165, 165, 196,
+ 7, 7, 224, 224, 8, 39, 103, 134, 196, 227, 225, 256, 40, 71,
+ 226, 257, 166, 197, 72, 103, 227, 258, 135, 166, 197, 228, 104, 135,
+ 228, 259, 8, 8, 256, 256, 9, 40, 257, 288, 41, 72, 167, 198,
+ 198, 229, 258, 289, 136, 167, 229, 260, 73, 104, 259, 290, 105, 136,
+ 260, 291, 199, 230, 9, 9, 168, 199, 230, 261, 288, 288, 10, 41,
+ 289, 320, 42, 73, 290, 321, 137, 168, 261, 292, 74, 105, 291, 322,
+ 200, 231, 231, 262, 106, 137, 292, 323, 169, 200, 262, 293, 10, 10,
+ 320, 320, 11, 42, 321, 352, 43, 74, 138, 169, 293, 324, 322, 353,
+ 232, 263, 75, 106, 201, 232, 263, 294, 323, 354, 170, 201, 294, 325,
+ 107, 138, 324, 355, 11, 11, 352, 352, 12, 43, 233, 264, 264, 295,
+ 353, 384, 139, 170, 325, 356, 44, 75, 354, 385, 202, 233, 295, 326,
+ 76, 107, 355, 386, 171, 202, 326, 357, 108, 139, 356, 387, 265, 296,
+ 234, 265, 296, 327, 12, 12, 140, 171, 357, 388, 384, 384, 13, 44,
+ 203, 234, 327, 358, 385, 416, 45, 76, 386, 417, 77, 108, 387, 418,
+ 172, 203, 358, 389, 266, 297, 297, 328, 109, 140, 235, 266, 328, 359,
+ 388, 419, 204, 235, 359, 390, 141, 172, 389, 420, 13, 13, 416, 416,
+ 14, 45, 417, 448, 46, 77, 298, 329, 418, 449, 267, 298, 329, 360,
+ 78, 109, 173, 204, 390, 421, 419, 450, 236, 267, 360, 391, 110, 141,
+ 420, 451, 205, 236, 391, 422, 142, 173, 299, 330, 330, 361, 421, 452,
+ 14, 14, 268, 299, 361, 392, 448, 448, 15, 46, 449, 480, 47, 78,
+ 450, 481, 174, 205, 422, 453, 237, 268, 392, 423, 79, 110, 451, 482,
+ 111, 142, 452, 483, 331, 362, 300, 331, 362, 393, 206, 237, 423, 454,
+ 143, 174, 269, 300, 393, 424, 453, 484, 15, 15, 16, 47, 48, 79,
+ 238, 269, 424, 455, 175, 206, 454, 485, 80, 111, 332, 363, 363, 394,
+ 301, 332, 394, 425, 112, 143, 207, 238, 455, 486, 270, 301, 425, 456,
+ 144, 175, 364, 395, 16, 16, 239, 270, 456, 487, 17, 48, 333, 364,
+ 395, 426, 176, 207, 49, 80, 302, 333, 426, 457, 81, 112, 113, 144,
+ 208, 239, 271, 302, 457, 488, 365, 396, 396, 427, 145, 176, 334, 365,
+ 427, 458, 240, 271, 17, 17, 18, 49, 177, 208, 303, 334, 458, 489,
+ 50, 81, 82, 113, 272, 303, 209, 240, 397, 428, 114, 145, 366, 397,
+ 428, 459, 335, 366, 459, 490, 146, 177, 241, 272, 304, 335, 178, 209,
+ 18, 18, 19, 50, 51, 82, 398, 429, 429, 460, 367, 398, 460, 491,
+ 83, 114, 273, 304, 210, 241, 115, 146, 336, 367, 147, 178, 242, 273,
+ 305, 336, 430, 461, 399, 430, 461, 492, 179, 210, 19, 19, 368, 399,
+ 20, 51, 52, 83, 274, 305, 84, 115, 211, 242, 337, 368, 116, 147,
+ 431, 462, 462, 493, 306, 337, 148, 179, 243, 274, 400, 431, 369, 400,
+ 180, 211, 20, 20, 21, 52, 275, 306, 53, 84, 338, 369, 212, 243,
+ 85, 116, 463, 494, 432, 463, 117, 148, 401, 432, 307, 338, 244, 275,
+ 149, 180, 370, 401, 181, 212, 276, 307, 464, 495, 339, 370, 21, 21,
+ 22, 53, 433, 464, 54, 85, 213, 244, 86, 117, 402, 433, 118, 149,
+ 308, 339, 245, 276, 371, 402, 150, 181, 465, 496, 182, 213, 434, 465,
+ 340, 371, 277, 308, 22, 22, 23, 54, 403, 434, 55, 86, 214, 245,
+ 87, 118, 309, 340, 372, 403, 119, 150, 246, 277, 466, 497, 151, 182,
+ 435, 466, 341, 372, 183, 214, 278, 309, 404, 435, 23, 23, 24, 55,
+ 215, 246, 56, 87, 310, 341, 88, 119, 373, 404, 467, 498, 120, 151,
+ 247, 278, 436, 467, 152, 183, 342, 373, 279, 310, 405, 436, 184, 215,
+ 24, 24, 216, 247, 468, 499, 25, 56, 374, 405, 57, 88, 311, 342,
+ 89, 120, 437, 468, 248, 279, 121, 152, 153, 184, 343, 374, 406, 437,
+ 280, 311, 185, 216, 469, 500, 375, 406, 217, 248, 25, 25, 312, 343,
+ 26, 57, 58, 89, 438, 469, 90, 121, 249, 280, 122, 153, 344, 375,
+ 407, 438, 154, 185, 281, 312, 470, 501, 186, 217, 376, 407, 218, 249,
+ 313, 344, 439, 470, 26, 26, 27, 58, 59, 90, 91, 122, 250, 281,
+ 123, 154, 408, 439, 345, 376, 155, 186, 471, 502, 282, 313, 187, 218,
+ 377, 408, 440, 471, 314, 345, 219, 250, 27, 27, 28, 59, 60, 91,
+ 92, 123, 251, 282, 409, 440, 346, 377, 124, 155, 472, 503, 156, 187,
+ 283, 314, 188, 219, 378, 409, 441, 472, 315, 346, 220, 251, 28, 28,
+ 29, 60, 61, 92, 410, 441, 252, 283, 93, 124, 347, 378, 473, 504,
+ 125, 156, 284, 315, 157, 188, 442, 473, 379, 410, 189, 220, 316, 347,
+ 221, 252, 411, 442, 29, 29, 30, 61, 474, 505, 62, 93, 253, 284,
+ 348, 379, 94, 125, 126, 157, 285, 316, 158, 189, 443, 474, 380, 411,
+ 190, 221, 317, 348, 222, 253, 412, 443, 475, 506, 30, 30, 31, 62,
+ 349, 380, 254, 285, 63, 94, 95, 126, 127, 158, 444, 475, 286, 317,
+ 381, 412, 159, 190, 191, 222, 318, 349, 476, 507, 223, 254, 413, 444,
+ 350, 381, 255, 286, 445, 476, 287, 318, 382, 413, 319, 350, 477, 508,
+ 414, 445, 351, 382, 446, 477, 383, 414, 478, 509, 415, 446, 447, 478,
+ 479, 510, 480, 480, 481, 512, 482, 513, 483, 514, 484, 515, 485, 516,
+ 512, 512, 513, 544, 486, 517, 514, 545, 515, 546, 487, 518, 516, 547,
+ 517, 548, 488, 519, 544, 544, 518, 549, 545, 576, 546, 577, 547, 578,
+ 489, 520, 519, 550, 548, 579, 549, 580, 520, 551, 490, 521, 550, 581,
+ 576, 576, 577, 608, 578, 609, 521, 552, 579, 610, 551, 582, 491, 522,
+ 580, 611, 581, 612, 552, 583, 522, 553, 582, 613, 492, 523, 608, 608,
+ 609, 640, 610, 641, 553, 584, 611, 642, 523, 554, 583, 614, 612, 643,
+ 554, 585, 493, 524, 584, 615, 613, 644, 524, 555, 614, 645, 640, 640,
+ 585, 616, 641, 672, 555, 586, 642, 673, 615, 646, 643, 674, 494, 525,
+ 644, 675, 525, 556, 586, 617, 616, 647, 645, 676, 556, 587, 646, 677,
+ 495, 526, 617, 648, 587, 618, 672, 672, 526, 557, 673, 704, 674, 705,
+ 647, 678, 557, 588, 675, 706, 618, 649, 676, 707, 588, 619, 648, 679,
+ 677, 708, 496, 527, 527, 558, 558, 589, 678, 709, 619, 650, 649, 680,
+ 704, 704, 589, 620, 705, 736, 679, 710, 706, 737, 707, 738, 650, 681,
+ 620, 651, 497, 528, 528, 559, 708, 739, 680, 711, 559, 590, 709, 740,
+ 590, 621, 651, 682, 681, 712, 710, 741, 621, 652, 736, 736, 737, 768,
+ 529, 560, 711, 742, 498, 529, 560, 591, 738, 769, 682, 713, 652, 683,
+ 739, 770, 591, 622, 740, 771, 712, 743, 622, 653, 741, 772, 683, 714,
+ 653, 684, 713, 744, 742, 773, 530, 561, 561, 592, 499, 530, 592, 623,
+ 623, 654, 743, 774, 768, 768, 769, 800, 684, 715, 714, 745, 770, 801,
+ 771, 802, 654, 685, 744, 775, 772, 803, 562, 593, 531, 562, 593, 624,
+ 715, 746, 773, 804, 685, 716, 500, 531, 624, 655, 745, 776, 774, 805,
+ 655, 686, 716, 747, 775, 806, 746, 777, 800, 800, 801, 832, 686, 717,
+ 802, 833, 563, 594, 594, 625, 803, 834, 532, 563, 625, 656, 776, 807,
+ 804, 835, 501, 532, 656, 687, 747, 778, 717, 748, 805, 836, 777, 808,
+ 687, 718, 806, 837, 748, 779, 595, 626, 564, 595, 626, 657, 718, 749,
+ 778, 809, 807, 838, 832, 832, 533, 564, 657, 688, 833, 864, 834, 865,
+ 835, 866, 502, 533, 688, 719, 808, 839, 749, 780, 836, 867, 779, 810,
+ 719, 750, 837, 868, 809, 840, 596, 627, 627, 658, 565, 596, 658, 689,
+ 838, 869, 780, 811, 750, 781, 534, 565, 689, 720, 810, 841, 839, 870,
+ 864, 864, 503, 534, 720, 751, 865, 896, 866, 897, 840, 871, 867, 898,
+ 781, 812, 811, 842, 628, 659, 868, 899, 751, 782, 597, 628, 659, 690,
+ 566, 597, 690, 721, 869, 900, 841, 872, 535, 566, 721, 752, 812, 843,
+ 870, 901, 782, 813, 842, 873, 504, 535, 752, 783, 871, 902, 629, 660,
+ 660, 691, 896, 896, 897, 928, 598, 629, 691, 722, 813, 844, 898, 929,
+ 872, 903, 783, 814, 843, 874, 899, 930, 567, 598, 722, 753, 900, 931,
+ 536, 567, 753, 784, 873, 904, 901, 932, 814, 845, 844, 875, 902, 933,
+ 505, 536, 784, 815, 661, 692, 630, 661, 692, 723, 874, 905, 599, 630,
+ 723, 754, 903, 934, 845, 876, 568, 599, 754, 785, 928, 928, 815, 846,
+ 929, 960, 930, 961, 875, 906, 904, 935, 931, 962, 537, 568, 785, 816,
+ 932, 963, 905, 936, 662, 693, 693, 724, 846, 877, 933, 964, 876, 907,
+ 631, 662, 724, 755, 506, 537, 816, 847, 934, 965, 600, 631, 755, 786,
+ 906, 937, 569, 600, 786, 817, 935, 966, 877, 908, 847, 878, 960, 960,
+ 907, 938, 961, 992, 936, 967, 538, 569, 817, 848, 962, 993, 694, 725,
+ 663, 694, 725, 756, 963, 994, 632, 663, 756, 787, 964, 995, 878, 909,
+ 937, 968, 507, 538, 848, 879, 908, 939, 601, 632, 787, 818, 965, 996,
+ 966, 997, 570, 601, 818, 849, 938, 969, 879, 910, 909, 940, 967, 998,
+ 695, 726, 726, 757, 664, 695, 757, 788, 539, 570, 849, 880, 939, 970,
+ 633, 664, 788, 819, 968, 999, 602, 633, 819, 850, 910, 941, 508, 539,
+ 880, 911, 969, 1000, 940, 971, 571, 602, 850, 881, 727, 758, 696, 727,
+ 758, 789, 970, 1001, 665, 696, 789, 820, 911, 942, 941, 972, 540, 571,
+ 881, 912, 634, 665, 820, 851, 971, 1002, 603, 634, 851, 882, 942, 973,
+ 509, 540, 912, 943, 728, 759, 759, 790, 972, 1003, 572, 603, 882, 913,
+ 697, 728, 790, 821, 666, 697, 821, 852, 943, 974, 635, 666, 852, 883,
+ 541, 572, 913, 944, 973, 1004, 604, 635, 883, 914, 760, 791, 729, 760,
+ 791, 822, 510, 541, 944, 975, 974, 1005, 698, 729, 822, 853, 573, 604,
+ 914, 945, 667, 698, 853, 884, 636, 667, 884, 915, 975, 1006, 542, 573,
+ 945, 976, 761, 792, 792, 823, 605, 636, 915, 946, 730, 761, 823, 854,
+ 699, 730, 854, 885, 511, 542, 976, 1007, 574, 605, 946, 977, 668, 699,
+ 885, 916, 637, 668, 916, 947, 543, 574, 793, 824, 977, 1008, 762, 793,
+ 824, 855, 731, 762, 855, 886, 606, 637, 947, 978, 700, 731, 886, 917,
+ 669, 700, 917, 948, 575, 606, 978, 1009, 638, 669, 948, 979, 794, 825,
+ 825, 856, 763, 794, 856, 887, 732, 763, 887, 918, 607, 638, 979, 1010,
+ 701, 732, 918, 949, 670, 701, 949, 980, 826, 857, 795, 826, 857, 888,
+ 764, 795, 888, 919, 639, 670, 980, 1011, 733, 764, 919, 950, 702, 733,
+ 950, 981, 671, 702, 981, 1012, 827, 858, 858, 889, 796, 827, 889, 920,
+ 765, 796, 920, 951, 734, 765, 951, 982, 703, 734, 982, 1013, 859, 890,
+ 828, 859, 890, 921, 797, 828, 921, 952, 766, 797, 952, 983, 735, 766,
+ 983, 1014, 860, 891, 891, 922, 829, 860, 922, 953, 798, 829, 953, 984,
+ 767, 798, 984, 1015, 892, 923, 861, 892, 923, 954, 830, 861, 954, 985,
+ 799, 830, 985, 1016, 893, 924, 924, 955, 862, 893, 955, 986, 831, 862,
+ 986, 1017, 925, 956, 894, 925, 956, 987, 863, 894, 987, 1018, 926, 957,
+ 957, 988, 895, 926, 988, 1019, 958, 989, 927, 958, 989, 1020, 959, 990,
+ 990, 1021, 991, 1022, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ qtr_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 32, 1, 1, 32, 32, 2, 33,
+ 33, 64, 34, 65, 2, 2, 64, 64, 3, 34, 65, 96, 35, 66,
+ 66, 97, 3, 3, 96, 96, 4, 35, 97, 128, 67, 98, 36, 67,
+ 98, 129, 4, 4, 68, 99, 99, 130, 128, 128, 5, 36, 129, 160,
+ 37, 68, 130, 161, 100, 131, 69, 100, 131, 162, 5, 5, 160, 160,
+ 6, 37, 161, 192, 38, 69, 162, 193, 101, 132, 132, 163, 70, 101,
+ 163, 194, 6, 6, 192, 192, 7, 38, 133, 164, 193, 224, 102, 133,
+ 164, 195, 39, 70, 194, 225, 71, 102, 195, 226, 134, 165, 165, 196,
+ 7, 7, 224, 224, 8, 39, 103, 134, 196, 227, 225, 256, 40, 71,
+ 226, 257, 166, 197, 72, 103, 227, 258, 135, 166, 197, 228, 104, 135,
+ 228, 259, 8, 8, 256, 256, 9, 40, 257, 288, 41, 72, 167, 198,
+ 198, 229, 258, 289, 136, 167, 229, 260, 73, 104, 259, 290, 105, 136,
+ 260, 291, 199, 230, 9, 9, 168, 199, 230, 261, 288, 288, 10, 41,
+ 289, 320, 42, 73, 290, 321, 137, 168, 261, 292, 74, 105, 291, 322,
+ 200, 231, 231, 262, 106, 137, 292, 323, 169, 200, 262, 293, 10, 10,
+ 320, 320, 11, 42, 321, 352, 43, 74, 138, 169, 293, 324, 322, 353,
+ 232, 263, 75, 106, 201, 232, 263, 294, 323, 354, 170, 201, 294, 325,
+ 107, 138, 324, 355, 11, 11, 352, 352, 12, 43, 233, 264, 264, 295,
+ 353, 384, 139, 170, 325, 356, 44, 75, 354, 385, 202, 233, 295, 326,
+ 76, 107, 355, 386, 171, 202, 326, 357, 108, 139, 356, 387, 265, 296,
+ 234, 265, 296, 327, 12, 12, 140, 171, 357, 388, 384, 384, 13, 44,
+ 203, 234, 327, 358, 385, 416, 45, 76, 386, 417, 77, 108, 387, 418,
+ 172, 203, 358, 389, 266, 297, 297, 328, 109, 140, 235, 266, 328, 359,
+ 388, 419, 204, 235, 359, 390, 141, 172, 389, 420, 13, 13, 416, 416,
+ 14, 45, 417, 448, 46, 77, 298, 329, 418, 449, 267, 298, 329, 360,
+ 78, 109, 173, 204, 390, 421, 419, 450, 236, 267, 360, 391, 110, 141,
+ 420, 451, 205, 236, 391, 422, 142, 173, 299, 330, 330, 361, 421, 452,
+ 14, 14, 268, 299, 361, 392, 448, 448, 15, 46, 449, 480, 47, 78,
+ 450, 481, 174, 205, 422, 453, 237, 268, 392, 423, 79, 110, 451, 482,
+ 111, 142, 452, 483, 331, 362, 300, 331, 362, 393, 206, 237, 423, 454,
+ 143, 174, 269, 300, 393, 424, 453, 484, 238, 269, 424, 455, 175, 206,
+ 454, 485, 332, 363, 363, 394, 301, 332, 394, 425, 207, 238, 455, 486,
+ 270, 301, 425, 456, 364, 395, 239, 270, 456, 487, 333, 364, 395, 426,
+ 302, 333, 426, 457, 271, 302, 457, 488, 365, 396, 396, 427, 334, 365,
+ 427, 458, 303, 334, 458, 489, 397, 428, 366, 397, 428, 459, 335, 366,
+ 459, 490, 398, 429, 429, 460, 367, 398, 460, 491, 430, 461, 399, 430,
+ 461, 492, 431, 462, 462, 493, 463, 494, 15, 15, 480, 480, 16, 47,
+ 481, 512, 48, 79, 482, 513, 80, 111, 483, 514, 112, 143, 484, 515,
+ 144, 175, 485, 516, 16, 16, 512, 512, 17, 48, 513, 544, 176, 207,
+ 486, 517, 49, 80, 514, 545, 81, 112, 515, 546, 113, 144, 208, 239,
+ 487, 518, 516, 547, 145, 176, 517, 548, 240, 271, 488, 519, 17, 17,
+ 544, 544, 18, 49, 177, 208, 518, 549, 545, 576, 50, 81, 546, 577,
+ 82, 113, 547, 578, 272, 303, 489, 520, 209, 240, 519, 550, 114, 145,
+ 548, 579, 146, 177, 549, 580, 241, 272, 520, 551, 304, 335, 490, 521,
+ 178, 209, 550, 581, 18, 18, 576, 576, 19, 50, 577, 608, 51, 82,
+ 578, 609, 83, 114, 273, 304, 521, 552, 579, 610, 210, 241, 551, 582,
+ 115, 146, 336, 367, 491, 522, 580, 611, 147, 178, 581, 612, 242, 273,
+ 552, 583, 305, 336, 522, 553, 179, 210, 582, 613, 19, 19, 368, 399,
+ 492, 523, 608, 608, 20, 51, 609, 640, 52, 83, 610, 641, 274, 305,
+ 553, 584, 84, 115, 611, 642, 211, 242, 337, 368, 523, 554, 583, 614,
+ 116, 147, 612, 643, 306, 337, 554, 585, 148, 179, 243, 274, 400, 431,
+ 493, 524, 584, 615, 613, 644, 369, 400, 524, 555, 180, 211, 614, 645,
+ 20, 20, 640, 640, 21, 52, 275, 306, 585, 616, 641, 672, 53, 84,
+ 338, 369, 555, 586, 642, 673, 212, 243, 615, 646, 85, 116, 643, 674,
+ 432, 463, 494, 525, 117, 148, 644, 675, 401, 432, 525, 556, 307, 338,
+ 586, 617, 244, 275, 616, 647, 149, 180, 645, 676, 370, 401, 556, 587,
+ 181, 212, 646, 677, 276, 307, 464, 495, 495, 526, 617, 648, 339, 370,
+ 587, 618, 21, 21, 672, 672, 22, 53, 433, 464, 526, 557, 673, 704,
+ 54, 85, 674, 705, 213, 244, 647, 678, 86, 117, 402, 433, 557, 588,
+ 675, 706, 118, 149, 308, 339, 618, 649, 676, 707, 245, 276, 371, 402,
+ 588, 619, 648, 679, 150, 181, 677, 708, 496, 527, 465, 496, 527, 558,
+ 182, 213, 434, 465, 558, 589, 678, 709, 340, 371, 619, 650, 277, 308,
+ 649, 680, 22, 22, 704, 704, 23, 54, 403, 434, 589, 620, 705, 736,
+ 55, 86, 214, 245, 679, 710, 706, 737, 87, 118, 707, 738, 309, 340,
+ 650, 681, 372, 403, 620, 651, 119, 150, 497, 528, 528, 559, 708, 739,
+ 246, 277, 680, 711, 466, 497, 559, 590, 151, 182, 709, 740, 435, 466,
+ 590, 621, 341, 372, 651, 682, 183, 214, 278, 309, 681, 712, 710, 741,
+ 404, 435, 621, 652, 23, 23, 736, 736, 24, 55, 737, 768, 215, 246,
+ 529, 560, 711, 742, 56, 87, 498, 529, 560, 591, 738, 769, 310, 341,
+ 682, 713, 88, 119, 373, 404, 652, 683, 739, 770, 467, 498, 591, 622,
+ 120, 151, 740, 771, 247, 278, 712, 743, 436, 467, 622, 653, 152, 183,
+ 741, 772, 342, 373, 683, 714, 279, 310, 405, 436, 653, 684, 713, 744,
+ 184, 215, 742, 773, 530, 561, 561, 592, 499, 530, 592, 623, 24, 24,
+ 216, 247, 468, 499, 623, 654, 743, 774, 768, 768, 25, 56, 769, 800,
+ 374, 405, 684, 715, 57, 88, 311, 342, 714, 745, 770, 801, 89, 120,
+ 771, 802, 437, 468, 654, 685, 248, 279, 744, 775, 121, 152, 772, 803,
+ 562, 593, 153, 184, 343, 374, 531, 562, 593, 624, 715, 746, 773, 804,
+ 406, 437, 685, 716, 500, 531, 624, 655, 280, 311, 745, 776, 185, 216,
+ 774, 805, 469, 500, 655, 686, 375, 406, 716, 747, 217, 248, 775, 806,
+ 25, 25, 312, 343, 746, 777, 800, 800, 26, 57, 801, 832, 58, 89,
+ 438, 469, 686, 717, 802, 833, 90, 121, 563, 594, 594, 625, 803, 834,
+ 249, 280, 532, 563, 625, 656, 776, 807, 122, 153, 804, 835, 344, 375,
+ 501, 532, 656, 687, 747, 778, 407, 438, 717, 748, 154, 185, 805, 836,
+ 281, 312, 777, 808, 470, 501, 687, 718, 186, 217, 806, 837, 376, 407,
+ 748, 779, 595, 626, 564, 595, 626, 657, 218, 249, 313, 344, 439, 470,
+ 718, 749, 778, 809, 807, 838, 26, 26, 832, 832, 27, 58, 533, 564,
+ 657, 688, 833, 864, 59, 90, 834, 865, 91, 122, 835, 866, 250, 281,
+ 502, 533, 688, 719, 808, 839, 123, 154, 408, 439, 749, 780, 836, 867,
+ 345, 376, 779, 810, 155, 186, 471, 502, 719, 750, 837, 868, 282, 313,
+ 809, 840, 596, 627, 627, 658, 187, 218, 565, 596, 658, 689, 838, 869,
+ 377, 408, 780, 811, 440, 471, 750, 781, 534, 565, 689, 720, 314, 345,
+ 810, 841, 219, 250, 839, 870, 27, 27, 864, 864, 28, 59, 503, 534,
+ 720, 751, 865, 896, 60, 91, 866, 897, 92, 123, 251, 282, 840, 871,
+ 867, 898, 409, 440, 781, 812, 346, 377, 811, 842, 124, 155, 628, 659,
+ 868, 899, 472, 503, 751, 782, 597, 628, 659, 690, 566, 597, 690, 721,
+ 156, 187, 869, 900, 283, 314, 841, 872, 535, 566, 721, 752, 188, 219,
+ 378, 409, 812, 843, 870, 901, 441, 472, 782, 813, 315, 346, 842, 873,
+ 504, 535, 752, 783, 220, 251, 871, 902, 28, 28, 629, 660, 660, 691,
+ 896, 896, 29, 60, 897, 928, 61, 92, 410, 441, 598, 629, 691, 722,
+ 813, 844, 898, 929, 252, 283, 872, 903, 93, 124, 347, 378, 473, 504,
+ 783, 814, 843, 874, 899, 930, 567, 598, 722, 753, 125, 156, 900, 931,
+ 284, 315, 536, 567, 753, 784, 873, 904, 157, 188, 901, 932, 442, 473,
+ 814, 845, 379, 410, 844, 875, 189, 220, 902, 933, 505, 536, 784, 815,
+ 661, 692, 316, 347, 630, 661, 692, 723, 874, 905, 221, 252, 599, 630,
+ 723, 754, 903, 934, 411, 442, 845, 876, 29, 29, 568, 599, 754, 785,
+ 928, 928, 30, 61, 474, 505, 815, 846, 929, 960, 62, 93, 930, 961,
+ 253, 284, 348, 379, 875, 906, 904, 935, 94, 125, 931, 962, 537, 568,
+ 785, 816, 126, 157, 932, 963, 285, 316, 905, 936, 158, 189, 443, 474,
+ 662, 693, 693, 724, 846, 877, 933, 964, 380, 411, 876, 907, 631, 662,
+ 724, 755, 506, 537, 816, 847, 190, 221, 934, 965, 600, 631, 755, 786,
+ 317, 348, 906, 937, 222, 253, 569, 600, 786, 817, 935, 966, 412, 443,
+ 877, 908, 475, 506, 847, 878, 30, 30, 960, 960, 31, 62, 349, 380,
+ 907, 938, 961, 992, 254, 285, 936, 967, 63, 94, 538, 569, 817, 848,
+ 962, 993, 694, 725, 95, 126, 663, 694, 725, 756, 963, 994, 632, 663,
+ 756, 787, 127, 158, 964, 995, 444, 475, 878, 909, 286, 317, 937, 968,
+ 381, 412, 507, 538, 848, 879, 908, 939, 159, 190, 601, 632, 787, 818,
+ 965, 996, 191, 222, 966, 997, 318, 349, 570, 601, 818, 849, 938, 969,
+ 476, 507, 879, 910, 223, 254, 413, 444, 909, 940, 967, 998, 695, 726,
+ 726, 757, 664, 695, 757, 788, 539, 570, 849, 880, 350, 381, 939, 970,
+ 255, 286, 633, 664, 788, 819, 968, 999, 445, 476, 602, 633, 819, 850,
+ 910, 941, 508, 539, 880, 911, 287, 318, 969, 1000, 382, 413, 940, 971,
+ 571, 602, 850, 881, 727, 758, 696, 727, 758, 789, 319, 350, 970, 1001,
+ 477, 508, 665, 696, 789, 820, 911, 942, 414, 445, 941, 972, 540, 571,
+ 881, 912, 634, 665, 820, 851, 351, 382, 971, 1002, 603, 634, 851, 882,
+ 446, 477, 942, 973, 509, 540, 912, 943, 383, 414, 728, 759, 759, 790,
+ 972, 1003, 572, 603, 882, 913, 697, 728, 790, 821, 666, 697, 821, 852,
+ 478, 509, 943, 974, 635, 666, 852, 883, 415, 446, 541, 572, 913, 944,
+ 973, 1004, 604, 635, 883, 914, 760, 791, 729, 760, 791, 822, 510, 541,
+ 944, 975, 447, 478, 974, 1005, 698, 729, 822, 853, 573, 604, 914, 945,
+ 667, 698, 853, 884, 636, 667, 884, 915, 479, 510, 975, 1006, 542, 573,
+ 945, 976, 761, 792, 792, 823, 605, 636, 915, 946, 730, 761, 823, 854,
+ 699, 730, 854, 885, 511, 542, 976, 1007, 574, 605, 946, 977, 668, 699,
+ 885, 916, 637, 668, 916, 947, 543, 574, 793, 824, 977, 1008, 762, 793,
+ 824, 855, 731, 762, 855, 886, 606, 637, 947, 978, 700, 731, 886, 917,
+ 669, 700, 917, 948, 575, 606, 978, 1009, 638, 669, 948, 979, 794, 825,
+ 825, 856, 763, 794, 856, 887, 732, 763, 887, 918, 607, 638, 979, 1010,
+ 701, 732, 918, 949, 670, 701, 949, 980, 826, 857, 795, 826, 857, 888,
+ 764, 795, 888, 919, 639, 670, 980, 1011, 733, 764, 919, 950, 702, 733,
+ 950, 981, 671, 702, 981, 1012, 827, 858, 858, 889, 796, 827, 889, 920,
+ 765, 796, 920, 951, 734, 765, 951, 982, 703, 734, 982, 1013, 859, 890,
+ 828, 859, 890, 921, 797, 828, 921, 952, 766, 797, 952, 983, 735, 766,
+ 983, 1014, 860, 891, 891, 922, 829, 860, 922, 953, 798, 829, 953, 984,
+ 767, 798, 984, 1015, 892, 923, 861, 892, 923, 954, 830, 861, 954, 985,
+ 799, 830, 985, 1016, 893, 924, 924, 955, 862, 893, 955, 986, 831, 862,
+ 986, 1017, 925, 956, 894, 925, 956, 987, 863, 894, 987, 1018, 926, 957,
+ 957, 988, 895, 926, 988, 1019, 958, 989, 927, 958, 989, 1020, 959, 990,
+ 990, 1021, 991, 1022, 0, 0
+};
+#endif // CONFIG_EXT_TX
+
DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x4[16]) = {
0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15,
};
+#if CONFIG_EXT_TX
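+/* mcol/mrow are the pure column-major and row-major scans: each inverse-scan
+ * (iscan) entry is the scan index of the coefficient at that raster
+ * position.  They are presumably paired with the 1-D transform types that
+ * CONFIG_EXT_TX adds. */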
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = {
+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x4[16]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+};
+#endif // CONFIG_EXT_TX
+
DECLARE_ALIGNED(16, static const int16_t, av1_col_iscan_4x4[16]) = {
0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15,
};
@@ -527,6 +2849,56 @@
0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
};
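+/* Default inverse scans for the rectangular transform sizes (4x8, 8x4, and
+ * larger) follow, using the same raster-position-to-scan-index convention
+ * as the square tables above. */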
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x8[32]) = {
+ 0, 1, 4, 9, 2, 3, 6, 11, 5, 7, 8, 13, 10, 12, 14, 17,
+ 15, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x8[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x8[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+#endif // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x4[32]) = {
+ 0, 1, 4, 9, 15, 19, 24, 28, 2, 3, 6, 11, 16, 21, 25, 29,
+ 5, 7, 8, 13, 18, 22, 26, 30, 10, 12, 14, 17, 20, 23, 27, 31,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x4[32]) = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29,
+ 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x4[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+#endif // CONFIG_EXT_TX
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x8[64]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57,
+ 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59,
+ 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61,
+ 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x8[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+#endif // CONFIG_EXT_TX
+
DECLARE_ALIGNED(16, static const int16_t, av1_col_iscan_8x8[64]) = {
0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51,
2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56,
@@ -548,6 +2920,346 @@
25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
};
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x16[128]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29, 36,
+ 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38, 45, 52,
+ 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47, 54, 61, 68,
+ 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56, 63, 70, 77, 84,
+ 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65, 72, 79, 86, 93, 100,
+ 59, 66, 73, 80, 87, 94, 101, 107, 67, 74, 81, 88, 95, 102, 108, 113,
+ 75, 82, 89, 96, 103, 109, 114, 118, 83, 90, 97, 104, 110, 115, 119, 122,
+ 91, 98, 105, 111, 116, 120, 123, 125, 99, 106, 112, 117, 121, 124, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x8[128]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 36, 44, 52, 60, 68, 76, 84, 92,
+ 2, 4, 7, 11, 16, 22, 29, 37, 45, 53, 61, 69, 77, 85, 93, 100,
+ 5, 8, 12, 17, 23, 30, 38, 46, 54, 62, 70, 78, 86, 94, 101, 107,
+ 9, 13, 18, 24, 31, 39, 47, 55, 63, 71, 79, 87, 95, 102, 108, 113,
+ 14, 19, 25, 32, 40, 48, 56, 64, 72, 80, 88, 96, 103, 109, 114, 118,
+ 20, 26, 33, 41, 49, 57, 65, 73, 81, 89, 97, 104, 110, 115, 119, 122,
+ 27, 34, 42, 50, 58, 66, 74, 82, 90, 98, 105, 111, 116, 120, 123, 125,
+ 35, 43, 51, 59, 67, 75, 83, 91, 99, 106, 112, 117, 121, 124, 126, 127,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x16[128]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113,
+ 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115,
+ 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117,
+ 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119,
+ 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121,
+ 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123,
+ 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125,
+ 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x8[128]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
+ 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
+ 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
+ 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123,
+ 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124,
+ 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125,
+ 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
+ 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x16[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x8[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+#endif  // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x32[512]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105,
+ 120, 2, 4, 7, 11, 16, 22, 29, 37, 46, 56, 67, 79, 92, 106,
+ 121, 136, 5, 8, 12, 17, 23, 30, 38, 47, 57, 68, 80, 93, 107,
+ 122, 137, 152, 9, 13, 18, 24, 31, 39, 48, 58, 69, 81, 94, 108,
+ 123, 138, 153, 168, 14, 19, 25, 32, 40, 49, 59, 70, 82, 95, 109,
+ 124, 139, 154, 169, 184, 20, 26, 33, 41, 50, 60, 71, 83, 96, 110,
+ 125, 140, 155, 170, 185, 200, 27, 34, 42, 51, 61, 72, 84, 97, 111,
+ 126, 141, 156, 171, 186, 201, 216, 35, 43, 52, 62, 73, 85, 98, 112,
+ 127, 142, 157, 172, 187, 202, 217, 232, 44, 53, 63, 74, 86, 99, 113,
+ 128, 143, 158, 173, 188, 203, 218, 233, 248, 54, 64, 75, 87, 100, 114,
+ 129, 144, 159, 174, 189, 204, 219, 234, 249, 264, 65, 76, 88, 101, 115,
+ 130, 145, 160, 175, 190, 205, 220, 235, 250, 265, 280, 77, 89, 102, 116,
+ 131, 146, 161, 176, 191, 206, 221, 236, 251, 266, 281, 296, 90, 103, 117,
+ 132, 147, 162, 177, 192, 207, 222, 237, 252, 267, 282, 297, 312, 104, 118,
+ 133, 148, 163, 178, 193, 208, 223, 238, 253, 268, 283, 298, 313, 328, 119,
+ 134, 149, 164, 179, 194, 209, 224, 239, 254, 269, 284, 299, 314, 329, 344,
+ 135, 150, 165, 180, 195, 210, 225, 240, 255, 270, 285, 300, 315, 330, 345,
+ 360, 151, 166, 181, 196, 211, 226, 241, 256, 271, 286, 301, 316, 331, 346,
+ 361, 376, 167, 182, 197, 212, 227, 242, 257, 272, 287, 302, 317, 332, 347,
+ 362, 377, 392, 183, 198, 213, 228, 243, 258, 273, 288, 303, 318, 333, 348,
+ 363, 378, 393, 407, 199, 214, 229, 244, 259, 274, 289, 304, 319, 334, 349,
+ 364, 379, 394, 408, 421, 215, 230, 245, 260, 275, 290, 305, 320, 335, 350,
+ 365, 380, 395, 409, 422, 434, 231, 246, 261, 276, 291, 306, 321, 336, 351,
+ 366, 381, 396, 410, 423, 435, 446, 247, 262, 277, 292, 307, 322, 337, 352,
+ 367, 382, 397, 411, 424, 436, 447, 457, 263, 278, 293, 308, 323, 338, 353,
+ 368, 383, 398, 412, 425, 437, 448, 458, 467, 279, 294, 309, 324, 339, 354,
+ 369, 384, 399, 413, 426, 438, 449, 459, 468, 476, 295, 310, 325, 340, 355,
+ 370, 385, 400, 414, 427, 439, 450, 460, 469, 477, 484, 311, 326, 341, 356,
+ 371, 386, 401, 415, 428, 440, 451, 461, 470, 478, 485, 491, 327, 342, 357,
+ 372, 387, 402, 416, 429, 441, 452, 462, 471, 479, 486, 492, 497, 343, 358,
+ 373, 388, 403, 417, 430, 442, 453, 463, 472, 480, 487, 493, 498, 502, 359,
+ 374, 389, 404, 418, 431, 443, 454, 464, 473, 481, 488, 494, 499, 503, 506,
+ 375, 390, 405, 419, 432, 444, 455, 465, 474, 482, 489, 495, 500, 504, 507,
+ 509, 391, 406, 420, 433, 445, 456, 466, 475, 483, 490, 496, 501, 505, 508,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x16[512]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105,
+ 120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344,
+ 360, 376, 2, 4, 7, 11, 16, 22, 29, 37, 46, 56, 67, 79, 92,
+ 106, 121, 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329,
+ 345, 361, 377, 392, 5, 8, 12, 17, 23, 30, 38, 47, 57, 68, 80,
+ 93, 107, 122, 138, 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314,
+ 330, 346, 362, 378, 393, 407, 9, 13, 18, 24, 31, 39, 48, 58, 69,
+ 81, 94, 108, 123, 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299,
+ 315, 331, 347, 363, 379, 394, 408, 421, 14, 19, 25, 32, 40, 49, 59,
+ 70, 82, 95, 109, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284,
+ 300, 316, 332, 348, 364, 380, 395, 409, 422, 434, 20, 26, 33, 41, 50,
+ 60, 71, 83, 96, 110, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269,
+ 285, 301, 317, 333, 349, 365, 381, 396, 410, 423, 435, 446, 27, 34, 42,
+ 51, 61, 72, 84, 97, 111, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+ 270, 286, 302, 318, 334, 350, 366, 382, 397, 411, 424, 436, 447, 457, 35,
+ 43, 52, 62, 73, 85, 98, 112, 127, 143, 159, 175, 191, 207, 223, 239,
+ 255, 271, 287, 303, 319, 335, 351, 367, 383, 398, 412, 425, 437, 448, 458,
+ 467, 44, 53, 63, 74, 86, 99, 113, 128, 144, 160, 176, 192, 208, 224,
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 399, 413, 426, 438, 449,
+ 459, 468, 476, 54, 64, 75, 87, 100, 114, 129, 145, 161, 177, 193, 209,
+ 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 400, 414, 427, 439,
+ 450, 460, 469, 477, 484, 65, 76, 88, 101, 115, 130, 146, 162, 178, 194,
+ 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 401, 415, 428,
+ 440, 451, 461, 470, 478, 485, 491, 77, 89, 102, 116, 131, 147, 163, 179,
+ 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371, 387, 402, 416,
+ 429, 441, 452, 462, 471, 479, 486, 492, 497, 90, 103, 117, 132, 148, 164,
+ 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340, 356, 372, 388, 403,
+ 417, 430, 442, 453, 463, 472, 480, 487, 493, 498, 502, 104, 118, 133, 149,
+ 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325, 341, 357, 373, 389,
+ 404, 418, 431, 443, 454, 464, 473, 481, 488, 494, 499, 503, 506, 119, 134,
+ 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342, 358, 374,
+ 390, 405, 419, 432, 444, 455, 465, 474, 482, 489, 495, 500, 504, 507, 509,
+ 135, 151, 167, 183, 199, 215, 231, 247, 263, 279, 295, 311, 327, 343, 359,
+ 375, 391, 406, 420, 433, 445, 456, 466, 475, 483, 490, 496, 501, 505, 508,
+ 510, 511,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x32[512]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480,
+ 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481,
+ 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482,
+ 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484,
+ 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485,
+ 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487,
+ 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488,
+ 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489,
+ 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491,
+ 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492,
+ 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494,
+ 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495,
+ 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496,
+ 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498,
+ 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499,
+ 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501,
+ 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502,
+ 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503,
+ 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505,
+ 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506,
+ 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508,
+ 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509,
+ 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510,
+ 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x16[512]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224,
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464,
+ 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193,
+ 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433,
+ 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162,
+ 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402,
+ 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131,
+ 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371,
+ 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100,
+ 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340,
+ 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69,
+ 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309,
+ 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38,
+ 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278,
+ 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7,
+ 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487,
+ 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216,
+ 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456,
+ 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185,
+ 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425,
+ 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154,
+ 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394,
+ 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123,
+ 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363,
+ 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92,
+ 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332,
+ 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61,
+ 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301,
+ 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30,
+ 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270,
+ 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239,
+ 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479,
+ 495, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x32[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x16[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+#endif // CONFIG_EXT_TX
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x16[256]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+ 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+ 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+ 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+ 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+ 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+ 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+ 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+ 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+ 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+ 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+ 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+ 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+ 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x16[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+#endif // CONFIG_EXT_TX
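[Editorial note, not part of the change: the mrow_* inverse-scan tables added above are the raster (row-major) order, i.e. the identity mapping, and the mcol_* tables are the column-major counterpart. A small check like the sketch below, compiled alongside the tables, would confirm the values; the helper name is hypothetical.]

    /* Sketch only: verify that mrow is the identity and mcol is the
     * column-first order, matching the tables spelled out above. */
    #include <assert.h>
    #include <stdint.h>

    static void check_mrow_mcol(const int16_t *mrow, const int16_t *mcol,
                                int rows, int cols) {
      int r, c;
      for (r = 0; r < rows; ++r) {
        for (c = 0; c < cols; ++c) {
          const int pos = r * cols + c;
          assert(mrow[pos] == pos);          /* row-major scan is the identity */
          assert(mcol[pos] == c * rows + r); /* column-first scan */
        }
      }
    }

[For example, check_mrow_mcol(av1_mrow_iscan_8x4, av1_mcol_iscan_8x4, 4, 8) and check_mrow_mcol(av1_mrow_iscan_16x16, av1_mcol_iscan_16x16, 16, 16) hold for the values listed above.]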
+
DECLARE_ALIGNED(16, static const int16_t, av1_col_iscan_16x16[256]) = {
0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198,
1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212,
@@ -609,6 +3321,167 @@
255,
};
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x32[1024]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416,
+ 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864,
+ 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289,
+ 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737,
+ 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162,
+ 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610,
+ 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35,
+ 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931,
+ 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356,
+ 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804,
+ 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229,
+ 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677,
+ 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102,
+ 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550,
+ 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423,
+ 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871,
+ 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296,
+ 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744,
+ 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169,
+ 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617,
+ 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42,
+ 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938,
+ 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363,
+ 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811,
+ 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236,
+ 268, 300, 332, 364, 396, 428, 460, 492, 524, 556, 588, 620, 652, 684,
+ 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109,
+ 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557,
+ 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430,
+ 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878,
+ 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303,
+ 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751,
+ 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176,
+ 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624,
+ 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49,
+ 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945,
+ 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370,
+ 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818,
+ 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243,
+ 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691,
+ 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 84, 116,
+ 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564,
+ 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437,
+ 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885,
+ 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310,
+ 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758,
+ 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183,
+ 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631,
+ 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56,
+ 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952,
+ 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377,
+ 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825,
+ 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250,
+ 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698,
+ 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123,
+ 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571,
+ 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444,
+ 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892,
+ 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317,
+ 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765,
+ 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190,
+ 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638,
+ 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63,
+ 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+ 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959,
+ 991, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x32[1024]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+ 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
+ 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+ 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
+ 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
+ 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
+ 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+ 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
+ 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
+ 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
+ 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
+ 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
+ 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285,
+ 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298,
+ 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311,
+ 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324,
+ 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337,
+ 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350,
+ 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
+ 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376,
+ 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402,
+ 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415,
+ 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428,
+ 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441,
+ 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454,
+ 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467,
+ 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480,
+ 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493,
+ 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506,
+ 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519,
+ 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532,
+ 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545,
+ 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558,
+ 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571,
+ 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584,
+ 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597,
+ 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610,
+ 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623,
+ 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636,
+ 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649,
+ 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662,
+ 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675,
+ 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688,
+ 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701,
+ 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714,
+ 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727,
+ 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740,
+ 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753,
+ 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766,
+ 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779,
+ 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792,
+ 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805,
+ 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818,
+ 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831,
+ 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844,
+ 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857,
+ 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870,
+ 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883,
+ 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896,
+ 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909,
+ 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922,
+ 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935,
+ 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948,
+ 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961,
+ 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974,
+ 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987,
+ 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000,
+ 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013,
+ 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
+};
+#endif // CONFIG_EXT_TX
+
DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x32[1024]) = {
0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145,
170, 193, 204, 210, 219, 229, 233, 245, 257, 275, 299, 342, 356,
@@ -691,9 +3564,256 @@
967, 973, 988, 996, 1002, 1006, 1014, 1018, 1021, 1023,
};
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, av1_v2_iscan_32x32[1024]) = {
+ 0, 1, 4, 9, 15, 22, 33, 43, 56, 71, 86, 104, 121,
+ 142, 166, 189, 512, 518, 527, 539, 551, 566, 584, 602, 621, 644,
+ 668, 695, 721, 748, 780, 811, 2, 3, 6, 11, 17, 26, 35,
+ 45, 58, 73, 90, 106, 123, 146, 168, 193, 513, 519, 528, 540,
+ 553, 567, 585, 603, 622, 647, 670, 696, 722, 751, 783, 812, 5,
+ 7, 8, 13, 20, 28, 37, 50, 62, 75, 92, 108, 129, 150,
+ 170, 195, 514, 521, 530, 541, 554, 569, 587, 605, 625, 649, 671,
+ 699, 725, 752, 785, 815, 10, 12, 14, 19, 23, 31, 41, 52,
+ 65, 81, 96, 113, 133, 152, 175, 201, 515, 522, 531, 542, 556,
+ 572, 589, 607, 629, 651, 673, 700, 726, 757, 788, 819, 16, 18,
+ 21, 24, 30, 39, 48, 59, 69, 83, 100, 119, 137, 158, 181,
+ 203, 516, 523, 534, 545, 559, 574, 591, 610, 632, 654, 679, 704,
+ 730, 762, 791, 824, 25, 27, 29, 32, 40, 46, 54, 67, 79,
+ 94, 109, 127, 143, 164, 185, 210, 517, 525, 535, 547, 561, 578,
+ 595, 615, 635, 656, 684, 707, 737, 766, 793, 830, 34, 36, 38,
+ 42, 49, 55, 64, 76, 87, 102, 117, 135, 154, 176, 197, 219,
+ 520, 529, 538, 550, 565, 580, 598, 618, 639, 664, 687, 712, 741,
+ 769, 802, 833, 44, 47, 51, 53, 60, 68, 77, 85, 98, 114,
+ 131, 147, 162, 183, 208, 227, 524, 533, 544, 557, 571, 588, 606,
+ 623, 645, 667, 692, 720, 747, 776, 806, 838, 57, 61, 63, 66,
+ 70, 80, 88, 99, 112, 124, 140, 159, 179, 199, 216, 233, 526,
+ 536, 548, 562, 577, 593, 613, 633, 653, 676, 701, 727, 756, 786,
+ 814, 847, 72, 74, 78, 82, 84, 95, 103, 115, 125, 139, 156,
+ 173, 190, 211, 229, 246, 532, 543, 555, 568, 581, 601, 619, 637,
+ 663, 685, 709, 738, 763, 792, 826, 855, 89, 91, 93, 97, 101,
+ 110, 118, 132, 141, 157, 171, 186, 206, 224, 241, 255, 537, 549,
+ 560, 576, 592, 608, 628, 650, 669, 693, 719, 744, 773, 805, 834,
+ 862, 105, 107, 111, 116, 120, 128, 136, 148, 160, 174, 187, 205,
+ 221, 236, 251, 267, 546, 558, 570, 583, 600, 617, 636, 657, 680,
+ 706, 729, 758, 787, 813, 846, 871, 122, 126, 130, 134, 138, 144,
+ 155, 163, 180, 191, 207, 222, 232, 248, 264, 278, 552, 564, 579,
+ 594, 609, 630, 648, 666, 688, 715, 742, 768, 797, 827, 856, 877,
+ 145, 149, 151, 153, 161, 165, 177, 184, 200, 212, 225, 237, 249,
+ 262, 275, 289, 563, 575, 590, 604, 620, 638, 660, 683, 705, 728,
+ 753, 779, 809, 839, 866, 889, 167, 169, 172, 178, 182, 188, 198,
+ 209, 217, 230, 242, 252, 265, 276, 288, 301, 573, 586, 599, 616,
+ 634, 652, 672, 694, 716, 743, 767, 794, 825, 850, 874, 899, 192,
+ 194, 196, 202, 204, 213, 220, 228, 234, 247, 256, 268, 279, 290,
+ 302, 315, 582, 597, 614, 631, 646, 665, 686, 708, 732, 759, 784,
+ 810, 837, 863, 886, 908, 214, 215, 218, 223, 226, 231, 239, 244,
+ 253, 261, 271, 283, 292, 304, 317, 325, 596, 611, 626, 642, 661,
+ 681, 702, 723, 745, 770, 800, 828, 853, 875, 897, 919, 235, 238,
+ 240, 243, 245, 250, 257, 263, 270, 280, 287, 298, 307, 319, 329,
+ 340, 612, 624, 640, 658, 677, 697, 717, 739, 764, 789, 816, 844,
+ 867, 890, 909, 927, 254, 258, 259, 260, 266, 269, 272, 282, 286,
+ 296, 303, 312, 323, 333, 341, 355, 627, 641, 655, 674, 690, 713,
+ 735, 760, 781, 807, 835, 857, 880, 902, 921, 940, 273, 274, 277,
+ 281, 284, 285, 291, 299, 305, 310, 320, 327, 337, 346, 357, 369,
+ 643, 659, 675, 689, 710, 733, 754, 777, 803, 831, 851, 872, 892,
+ 913, 934, 950, 293, 294, 295, 297, 300, 306, 308, 314, 321, 326,
+ 335, 343, 352, 361, 372, 378, 662, 678, 691, 711, 731, 749, 774,
+ 798, 822, 848, 869, 887, 906, 925, 942, 961, 309, 311, 313, 316,
+ 318, 322, 324, 332, 338, 344, 351, 358, 367, 375, 386, 394, 682,
+ 698, 714, 734, 750, 772, 795, 820, 842, 864, 884, 904, 923, 938,
+ 954, 967, 328, 330, 331, 334, 336, 339, 342, 348, 354, 359, 366,
+ 374, 382, 391, 400, 409, 703, 718, 736, 755, 775, 796, 818, 840,
+ 860, 882, 900, 917, 936, 952, 965, 977, 345, 347, 349, 350, 353,
+ 356, 360, 364, 371, 376, 383, 389, 395, 406, 412, 423, 724, 740,
+ 761, 778, 799, 821, 841, 859, 878, 895, 915, 932, 948, 963, 975,
+ 986, 362, 363, 365, 368, 370, 373, 377, 379, 387, 392, 397, 405,
+ 411, 420, 428, 439, 746, 765, 782, 804, 823, 843, 861, 879, 894,
+ 911, 930, 946, 959, 973, 984, 994, 380, 381, 384, 385, 388, 390,
+ 393, 396, 403, 408, 413, 422, 427, 436, 444, 452, 771, 790, 808,
+ 832, 849, 865, 883, 896, 912, 928, 944, 957, 971, 982, 992, 1001,
+ 398, 399, 401, 402, 404, 407, 410, 414, 419, 425, 429, 437, 442,
+ 449, 458, 465, 801, 817, 836, 852, 870, 885, 901, 916, 931, 945,
+ 956, 969, 980, 990, 999, 1007, 415, 416, 417, 418, 421, 424, 426,
+ 430, 434, 441, 445, 453, 459, 463, 473, 480, 829, 845, 858, 873,
+ 888, 905, 918, 933, 947, 958, 970, 979, 988, 997, 1005, 1012, 431,
+ 432, 433, 435, 438, 440, 443, 446, 451, 456, 461, 468, 475, 479,
+ 488, 494, 854, 868, 881, 893, 907, 924, 937, 949, 960, 972, 981,
+ 989, 996, 1003, 1010, 1016, 447, 448, 450, 454, 455, 457, 460, 462,
+ 469, 472, 477, 482, 490, 495, 499, 503, 876, 891, 903, 914, 926,
+ 939, 953, 964, 974, 983, 991, 998, 1004, 1009, 1014, 1019, 464, 466,
+ 467, 470, 471, 474, 476, 478, 484, 489, 493, 497, 501, 504, 506,
+ 508, 898, 910, 922, 935, 943, 955, 966, 976, 985, 993, 1000, 1006,
+ 1011, 1015, 1018, 1021, 481, 483, 485, 486, 487, 491, 492, 496, 498,
+ 500, 502, 505, 507, 509, 510, 511, 920, 929, 941, 951, 962, 968,
+ 978, 987, 995, 1002, 1008, 1013, 1017, 1020, 1022, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_h2_iscan_32x32[1024]) = {
+ 0, 1, 4, 9, 15, 22, 33, 43, 56, 71, 86, 104, 121,
+ 142, 166, 189, 214, 233, 254, 273, 292, 309, 328, 345, 362, 378,
+ 397, 415, 431, 447, 464, 481, 2, 3, 6, 11, 17, 26, 35,
+ 45, 58, 73, 90, 106, 123, 146, 168, 193, 215, 236, 255, 274,
+ 294, 310, 329, 346, 363, 381, 399, 416, 432, 448, 465, 482, 5,
+ 7, 8, 13, 20, 28, 37, 50, 62, 75, 92, 108, 129, 150,
+ 170, 195, 216, 240, 259, 275, 295, 312, 331, 348, 365, 383, 400,
+ 417, 433, 449, 467, 485, 10, 12, 14, 19, 23, 31, 41, 52,
+ 65, 81, 96, 113, 133, 152, 175, 201, 221, 243, 260, 280, 297,
+ 315, 333, 350, 367, 385, 402, 418, 434, 452, 470, 486, 16, 18,
+ 21, 24, 30, 39, 48, 59, 69, 83, 100, 119, 137, 158, 181,
+ 203, 226, 244, 264, 283, 300, 318, 335, 353, 370, 388, 404, 420,
+ 438, 455, 471, 487, 25, 27, 29, 32, 40, 46, 54, 67, 79,
+ 94, 109, 127, 143, 164, 185, 210, 231, 250, 269, 285, 304, 322,
+ 339, 356, 373, 389, 407, 423, 440, 457, 473, 491, 34, 36, 38,
+ 42, 49, 55, 64, 76, 87, 102, 117, 135, 154, 176, 197, 219,
+ 239, 256, 272, 291, 308, 324, 341, 359, 377, 393, 410, 426, 442,
+ 460, 476, 492, 44, 47, 51, 53, 60, 68, 77, 85, 98, 114,
+ 131, 147, 162, 183, 208, 227, 245, 262, 282, 298, 314, 332, 349,
+ 364, 379, 396, 412, 430, 446, 462, 478, 495, 57, 61, 63, 66,
+ 70, 80, 88, 99, 112, 124, 140, 159, 179, 199, 217, 234, 253,
+ 270, 286, 305, 321, 337, 354, 371, 387, 403, 419, 435, 451, 468,
+ 484, 498, 72, 74, 78, 82, 84, 95, 103, 115, 125, 139, 156,
+ 173, 190, 211, 229, 246, 261, 281, 296, 311, 325, 344, 360, 375,
+ 392, 408, 425, 441, 456, 472, 489, 500, 89, 91, 93, 97, 101,
+ 110, 118, 132, 141, 157, 171, 186, 206, 224, 241, 257, 271, 287,
+ 303, 320, 336, 351, 366, 384, 398, 413, 429, 445, 461, 477, 493,
+ 502, 105, 107, 111, 116, 120, 128, 136, 148, 160, 174, 187, 205,
+ 222, 237, 251, 267, 284, 299, 313, 327, 343, 358, 374, 390, 405,
+ 422, 437, 453, 469, 483, 497, 505, 122, 126, 130, 134, 138, 144,
+ 155, 163, 180, 191, 207, 223, 232, 248, 265, 278, 293, 307, 323,
+ 338, 352, 368, 382, 395, 411, 427, 443, 459, 475, 490, 501, 507,
+ 145, 149, 151, 153, 161, 165, 177, 184, 200, 212, 225, 238, 249,
+ 263, 276, 289, 306, 319, 334, 347, 361, 376, 391, 406, 421, 436,
+ 450, 463, 479, 496, 504, 509, 167, 169, 172, 178, 182, 188, 198,
+ 209, 218, 230, 242, 252, 266, 277, 288, 301, 317, 330, 342, 357,
+ 372, 386, 401, 414, 428, 444, 458, 474, 488, 499, 506, 510, 192,
+ 194, 196, 202, 204, 213, 220, 228, 235, 247, 258, 268, 279, 290,
+ 302, 316, 326, 340, 355, 369, 380, 394, 409, 424, 439, 454, 466,
+ 480, 494, 503, 508, 511, 512, 513, 514, 515, 516, 517, 520, 523,
+ 526, 532, 537, 545, 551, 561, 573, 581, 596, 610, 625, 642, 661,
+ 680, 701, 722, 745, 770, 800, 827, 853, 875, 897, 919, 518, 519,
+ 521, 522, 524, 525, 528, 533, 536, 542, 549, 557, 564, 575, 585,
+ 597, 611, 623, 640, 656, 676, 696, 717, 739, 763, 789, 815, 844,
+ 867, 889, 909, 927, 527, 529, 530, 531, 534, 535, 538, 544, 548,
+ 555, 560, 569, 579, 589, 598, 614, 626, 641, 655, 673, 690, 712,
+ 735, 760, 780, 806, 834, 857, 880, 902, 921, 940, 539, 540, 541,
+ 543, 546, 547, 550, 558, 562, 567, 576, 583, 593, 603, 616, 631,
+ 643, 657, 674, 689, 710, 733, 752, 776, 803, 830, 850, 872, 892,
+ 913, 934, 950, 552, 553, 554, 556, 559, 563, 565, 571, 577, 582,
+ 591, 600, 609, 620, 634, 644, 662, 677, 691, 711, 730, 748, 773,
+ 798, 822, 847, 869, 887, 906, 925, 942, 961, 566, 568, 570, 572,
+ 574, 578, 580, 588, 594, 601, 608, 617, 629, 637, 652, 665, 681,
+ 697, 713, 734, 749, 772, 793, 819, 842, 863, 884, 904, 923, 938,
+ 954, 967, 584, 586, 587, 590, 592, 595, 599, 605, 613, 618, 628,
+ 636, 648, 660, 671, 686, 702, 718, 736, 753, 774, 794, 818, 840,
+ 860, 882, 900, 917, 936, 952, 965, 977, 602, 604, 606, 607, 612,
+ 615, 619, 624, 633, 638, 649, 658, 666, 683, 692, 707, 723, 740,
+ 761, 777, 799, 820, 841, 859, 877, 895, 915, 932, 948, 963, 975,
+ 986, 621, 622, 627, 630, 632, 635, 639, 645, 653, 663, 668, 682,
+ 688, 704, 716, 732, 746, 764, 781, 804, 823, 843, 861, 878, 894,
+ 911, 930, 946, 959, 973, 984, 994, 646, 647, 650, 651, 654, 659,
+ 664, 667, 678, 685, 693, 706, 715, 728, 743, 757, 771, 790, 807,
+ 831, 848, 864, 883, 896, 912, 928, 944, 957, 971, 982, 992, 1001,
+ 669, 670, 672, 675, 679, 684, 687, 694, 703, 709, 719, 729, 741,
+ 754, 767, 783, 801, 816, 835, 851, 870, 885, 901, 916, 931, 945,
+ 956, 969, 980, 990, 999, 1007, 695, 698, 699, 700, 705, 708, 714,
+ 720, 726, 738, 744, 758, 768, 779, 795, 810, 828, 845, 858, 873,
+ 888, 905, 918, 933, 947, 958, 970, 979, 988, 997, 1005, 1012, 721,
+ 724, 725, 727, 731, 737, 742, 747, 756, 765, 775, 786, 797, 809,
+ 825, 837, 854, 868, 881, 893, 907, 924, 937, 949, 960, 972, 981,
+ 989, 996, 1003, 1010, 1016, 750, 751, 755, 759, 762, 766, 769, 778,
+ 787, 792, 805, 812, 829, 838, 852, 865, 876, 890, 903, 914, 926,
+ 939, 953, 964, 974, 983, 991, 998, 1004, 1009, 1014, 1019, 782, 784,
+ 785, 788, 791, 796, 802, 808, 814, 826, 836, 846, 856, 866, 874,
+ 886, 898, 910, 922, 935, 943, 955, 966, 976, 985, 993, 1000, 1006,
+ 1011, 1015, 1018, 1021, 811, 813, 817, 821, 824, 832, 833, 839, 849,
+ 855, 862, 871, 879, 891, 899, 908, 920, 929, 941, 951, 962, 968,
+ 978, 987, 995, 1002, 1008, 1013, 1017, 1020, 1022, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_qtr_iscan_32x32[1024]) = {
+ 0, 1, 4, 9, 15, 22, 33, 43, 56, 71, 86, 104, 121,
+ 142, 166, 189, 256, 268, 286, 310, 334, 364, 400, 435, 471, 510,
+ 553, 598, 640, 683, 732, 780, 2, 3, 6, 11, 17, 26, 35,
+ 45, 58, 73, 90, 106, 123, 146, 168, 193, 258, 270, 288, 312,
+ 338, 366, 402, 437, 473, 516, 557, 600, 642, 687, 736, 782, 5,
+ 7, 8, 13, 20, 28, 37, 50, 62, 75, 92, 108, 129, 150,
+ 170, 195, 260, 274, 292, 314, 340, 370, 406, 441, 478, 520, 559,
+ 604, 646, 689, 740, 788, 10, 12, 14, 19, 23, 31, 41, 52,
+ 65, 81, 96, 113, 133, 152, 175, 201, 262, 276, 294, 316, 344,
+ 376, 410, 445, 484, 524, 563, 606, 648, 697, 746, 793, 16, 18,
+ 21, 24, 30, 39, 48, 59, 69, 83, 100, 119, 137, 158, 181,
+ 203, 264, 278, 300, 322, 350, 380, 414, 451, 490, 530, 571, 612,
+ 656, 705, 750, 799, 25, 27, 29, 32, 40, 46, 54, 67, 79,
+ 94, 109, 127, 143, 164, 185, 210, 266, 282, 302, 326, 354, 388,
+ 422, 459, 496, 533, 579, 618, 665, 711, 754, 809, 34, 36, 38,
+ 42, 49, 55, 64, 76, 87, 102, 117, 135, 154, 176, 197, 216,
+ 272, 289, 308, 332, 362, 392, 427, 465, 504, 545, 585, 626, 671,
+ 717, 766, 813, 44, 47, 51, 53, 60, 68, 77, 85, 98, 114,
+ 131, 147, 162, 183, 208, 222, 279, 298, 320, 346, 374, 408, 442,
+ 475, 511, 551, 592, 638, 681, 726, 772, 821, 57, 61, 63, 66,
+ 70, 80, 88, 99, 112, 124, 140, 159, 179, 199, 214, 227, 284,
+ 304, 328, 355, 386, 418, 455, 492, 528, 567, 608, 649, 695, 742,
+ 786, 833, 72, 74, 78, 82, 84, 95, 103, 115, 125, 139, 156,
+ 173, 190, 211, 224, 233, 296, 317, 342, 367, 394, 433, 466, 500,
+ 543, 581, 622, 667, 707, 752, 803, 843, 89, 91, 93, 97, 101,
+ 110, 118, 132, 141, 157, 171, 186, 206, 220, 231, 239, 306, 330,
+ 352, 384, 415, 447, 482, 521, 554, 593, 636, 677, 722, 770, 815,
+ 852, 105, 107, 111, 116, 120, 128, 136, 148, 160, 174, 187, 205,
+ 218, 229, 237, 244, 323, 347, 371, 398, 431, 463, 498, 534, 573,
+ 616, 654, 698, 743, 783, 831, 864, 122, 126, 130, 134, 138, 144,
+ 155, 163, 180, 191, 207, 219, 226, 235, 242, 248, 335, 360, 390,
+ 419, 449, 485, 518, 549, 587, 630, 672, 715, 760, 805, 845, 872,
+ 145, 149, 151, 153, 161, 165, 177, 184, 200, 212, 221, 230, 236,
+ 241, 246, 251, 356, 382, 411, 438, 469, 501, 539, 577, 613, 652,
+ 690, 730, 776, 822, 858, 886, 167, 169, 172, 178, 182, 188, 198,
+ 209, 215, 225, 232, 238, 243, 247, 250, 253, 378, 403, 428, 461,
+ 494, 526, 560, 594, 632, 675, 713, 755, 801, 837, 868, 897, 192,
+ 194, 196, 202, 204, 213, 217, 223, 228, 234, 240, 245, 249, 252,
+ 254, 255, 395, 425, 457, 488, 512, 547, 583, 619, 659, 699, 737,
+ 778, 819, 854, 882, 907, 257, 259, 261, 263, 265, 267, 273, 280,
+ 285, 297, 307, 324, 336, 357, 379, 396, 424, 452, 479, 508, 541,
+ 574, 609, 643, 679, 719, 764, 806, 841, 870, 895, 919, 269, 271,
+ 275, 277, 281, 283, 290, 299, 305, 318, 331, 348, 361, 383, 404,
+ 426, 453, 476, 506, 535, 568, 601, 634, 669, 708, 748, 789, 829,
+ 860, 887, 909, 927, 287, 291, 293, 295, 301, 303, 309, 321, 329,
+ 343, 353, 372, 391, 412, 429, 458, 480, 507, 532, 564, 590, 627,
+ 663, 703, 733, 773, 816, 847, 876, 901, 921, 940, 311, 313, 315,
+ 319, 325, 327, 333, 349, 358, 368, 385, 399, 420, 439, 462, 489,
+ 509, 536, 565, 589, 624, 661, 691, 727, 768, 810, 838, 866, 890,
+ 913, 934, 950, 337, 339, 341, 345, 351, 359, 363, 375, 387, 397,
+ 416, 432, 450, 470, 495, 513, 542, 569, 591, 625, 657, 684, 723,
+ 762, 797, 834, 862, 884, 905, 925, 942, 961, 365, 369, 373, 377,
+ 381, 389, 393, 409, 421, 434, 448, 464, 486, 502, 527, 548, 575,
+ 602, 628, 662, 685, 721, 756, 794, 827, 855, 880, 903, 923, 938,
+ 954, 967, 401, 405, 407, 413, 417, 423, 430, 443, 456, 467, 483,
+ 499, 519, 540, 561, 584, 610, 635, 664, 692, 724, 757, 792, 825,
+ 850, 878, 899, 917, 936, 952, 965, 977, 436, 440, 444, 446, 454,
+ 460, 468, 477, 493, 503, 522, 537, 550, 578, 595, 620, 644, 670,
+ 704, 728, 763, 795, 826, 849, 873, 893, 915, 932, 948, 963, 975,
+ 986, 472, 474, 481, 487, 491, 497, 505, 514, 529, 544, 555, 576,
+ 588, 614, 633, 660, 680, 709, 734, 769, 798, 828, 851, 874, 892,
+ 911, 930, 946, 959, 973, 984, 994, 515, 517, 523, 525, 531, 538,
+ 546, 552, 570, 582, 596, 617, 631, 653, 676, 700, 720, 749, 774,
+ 811, 835, 856, 879, 894, 912, 928, 944, 957, 971, 982, 992, 1001,
+ 556, 558, 562, 566, 572, 580, 586, 597, 611, 623, 637, 655, 673,
+ 693, 714, 738, 765, 790, 817, 839, 863, 881, 900, 916, 931, 945,
+ 956, 969, 980, 990, 999, 1007, 599, 603, 605, 607, 615, 621, 629,
+ 639, 650, 668, 678, 701, 716, 731, 758, 779, 807, 830, 848, 867,
+ 885, 904, 918, 933, 947, 958, 970, 979, 988, 997, 1005, 1012, 641,
+ 645, 647, 651, 658, 666, 674, 682, 696, 710, 725, 744, 761, 777,
+ 802, 820, 842, 861, 877, 891, 906, 924, 937, 949, 960, 972, 981,
+ 989, 996, 1003, 1010, 1016, 686, 688, 694, 702, 706, 712, 718, 729,
+ 745, 753, 771, 784, 808, 823, 840, 857, 871, 888, 902, 914, 926,
+ 939, 953, 964, 974, 983, 991, 998, 1004, 1009, 1014, 1019, 735, 739,
+ 741, 747, 751, 759, 767, 775, 787, 804, 818, 832, 846, 859, 869,
+ 883, 896, 910, 922, 935, 943, 955, 966, 976, 985, 993, 1000, 1006,
+ 1011, 1015, 1018, 1021, 781, 785, 791, 796, 800, 812, 814, 824, 836,
+ 844, 853, 865, 875, 889, 898, 908, 920, 929, 941, 951, 962, 968,
+ 978, 987, 995, 1002, 1008, 1013, 1017, 1020, 1022, 1023,
+};
+#endif // CONFIG_EXT_TX
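[Editorial note, not part of the change: judging from the values above, av1_h2_iscan_32x32 assigns its first 512 scan positions to the top 16 rows, av1_v2_iscan_32x32 assigns them to the left 16 columns, and av1_qtr_iscan_32x32 assigns its first 256 positions to the top-left 16x16 quadrant. A check sketch, with a hypothetical name, that verifies this property against the tables:]

    /* Sketch only: the half/quarter scans confine their earliest scan
     * positions to one region of the 32x32 block. */
    static void check_region_scans(void) {
      int r, c;
      for (r = 0; r < 32; ++r) {
        for (c = 0; c < 32; ++c) {
          const int pos = r * 32 + c;
          /* h2: the top 16 rows are scanned before the bottom 16 rows. */
          assert((av1_h2_iscan_32x32[pos] < 512) == (r < 16));
          /* v2: the left 16 columns are scanned before the right 16 columns. */
          assert((av1_v2_iscan_32x32[pos] < 512) == (c < 16));
          /* qtr: the top-left 16x16 quadrant gets the first 256 positions. */
          assert((av1_qtr_iscan_32x32[pos] < 256) == (r < 16 && c < 16));
        }
      }
    }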
+
const SCAN_ORDER av1_default_scan_orders[TX_SIZES] = {
#if CONFIG_CB4X4
- // TODO(jingning): use 2x2 scan order
{ default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
#endif
{ default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
@@ -702,40 +3822,404 @@
{ default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors },
};
-const SCAN_ORDER av1_scan_orders[TX_SIZES][TX_TYPES] = {
+const SCAN_ORDER av1_intra_scan_orders[TX_SIZES][TX_TYPES] = {
#if CONFIG_CB4X4
- { // TX_2X2
- { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
- { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
- { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
- { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors } },
+ {
+ // TX_2X2
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+#endif // CONFIG_EXT_TX
+ },
#endif
- { // TX_4X4
- { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
- { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
- { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
- { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors } },
- { // TX_8X8
- { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
- { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
- { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
- { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors } },
- { // TX_16X16
- { default_scan_16x16, av1_default_iscan_16x16,
- default_scan_16x16_neighbors },
- { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
- { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
- { default_scan_16x16, av1_default_iscan_16x16,
- default_scan_16x16_neighbors } },
- { // TX_32X32
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors },
- { default_scan_32x32, av1_default_iscan_32x32,
- default_scan_32x32_neighbors } },
+ {
+ // TX_4X4
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+ { row_scan_4x4, av1_row_iscan_4x4, row_scan_4x4_neighbors },
+ { col_scan_4x4, av1_col_iscan_4x4, col_scan_4x4_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_8X8
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
+ { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
+ { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
+ { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
+ { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
+ { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
+ { row_scan_8x8, av1_row_iscan_8x8, row_scan_8x8_neighbors },
+ { col_scan_8x8, av1_col_iscan_8x8, col_scan_8x8_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_16X16
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
+ { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
+ { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
+ { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
+ { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
+ { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
+ { row_scan_16x16, av1_row_iscan_16x16, row_scan_16x16_neighbors },
+ { col_scan_16x16, av1_col_iscan_16x16, col_scan_16x16_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_32X32
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+#if CONFIG_EXT_TX
+ { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
+ { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
+ { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
+ { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
+ { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
+ { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
+ { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
+ { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+#endif // CONFIG_EXT_TX
+ }
+};
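[Editorial note, not part of the change: each entry of av1_intra_scan_orders is a { scan, iscan, neighbors } triple indexed by transform size and transform type, as the initializers above show. A minimal lookup sketch, with a hypothetical helper name, might look like this:]

    /* Sketch only: pick an intra scan order by transform size and type.
     * A caller would walk the scan pointer to visit coefficients in coding
     * order and use iscan for the reverse (position-to-index) mapping. */
    static const SCAN_ORDER *example_get_intra_scan(TX_SIZE tx_size,
                                                    TX_TYPE tx_type) {
      return &av1_intra_scan_orders[tx_size][tx_type];
    }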
+
+const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
+#if CONFIG_CB4X4
+ {
+ // TX_2X2
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+#endif
+ {
+ // TX_4X4
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_8X8
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_16X16
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_32X32
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+#if CONFIG_EXT_TX
+ { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
+ { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
+ { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
+ { h2_scan_32x32, av1_h2_iscan_32x32, h2_scan_32x32_neighbors },
+ { v2_scan_32x32, av1_v2_iscan_32x32, v2_scan_32x32_neighbors },
+ { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
+ { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
+ { qtr_scan_32x32, av1_qtr_iscan_32x32, qtr_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_4X8
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_8X4
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_8X16
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_16X8
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_16X32
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+#endif // CONFIG_EXT_TX
+ },
+ {
+ // TX_32X16
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+#if CONFIG_EXT_TX
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+#endif // CONFIG_EXT_TX
+ }
};
#if CONFIG_ADAPT_SCAN
@@ -875,7 +4359,7 @@
void av1_update_neighbors(int tx_size, const int16_t *scan,
const int16_t *iscan, int16_t *neighbors) {
- const int tx1d_size = tx_size_1d[tx_size];
+ const int tx1d_size = tx_size_wide[tx_size];
const int tx2d_size = tx_size_2d[tx_size];
int scan_idx;
for (scan_idx = 0; scan_idx < tx2d_size; ++scan_idx) {
@@ -906,7 +4390,7 @@
void av1_update_sort_order(TX_SIZE tx_size, const uint32_t *non_zero_prob,
int16_t *sort_order) {
uint32_t temp[COEFF_IDX_SIZE];
- const int tx1d_size = tx_size_1d[tx_size];
+ const int tx1d_size = tx_size_wide[tx_size];
const int tx2d_size = tx_size_2d[tx_size];
int sort_idx;
assert(tx2d_size <= COEFF_IDX_SIZE);
@@ -924,7 +4408,7 @@
int coeff_idx;
int scan_idx;
int sort_idx;
- const int tx1d_size = tx_size_1d[tx_size];
+ const int tx1d_size = tx_size_wide[tx_size];
const int tx2d_size = tx_size_2d[tx_size];
for (coeff_idx = 0; coeff_idx < tx2d_size; ++coeff_idx) {
@@ -945,8 +4429,7 @@
int16_t *scan = get_adapt_scan(cm->fc, tx_size, tx_type);
int16_t *iscan = get_adapt_iscan(cm->fc, tx_size, tx_type);
int16_t *nb = get_adapt_nb(cm->fc, tx_size, tx_type);
- const int tx2d_size = tx_size_2d[tx_size];
- assert(tx2d_size <= 1024);
+ assert(tx_size_2d[tx_size] <= 1024);
av1_update_sort_order(tx_size, non_zero_prob, sort_order);
av1_update_scan_order(tx_size, sort_order, scan, iscan);
av1_update_neighbors(tx_size, scan, iscan, nb);
@@ -972,4 +4455,4 @@
}
}
-#endif
+#endif // CONFIG_ADAPT_SCAN
diff --git a/av1/common/scan.h b/av1/common/scan.h
index eeecae0..01f37b6 100644
--- a/av1/common/scan.h
+++ b/av1/common/scan.h
@@ -15,10 +15,9 @@
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
-#include "av1/common/blockd.h"
-#include "av1/common/entropymode.h"
#include "av1/common/enums.h"
#include "av1/common/onyxc_int.h"
+#include "av1/common/blockd.h"
#ifdef __cplusplus
extern "C" {
@@ -27,7 +26,8 @@
#define MAX_NEIGHBORS 2
extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES];
-extern const SCAN_ORDER av1_scan_orders[TX_SIZES][TX_TYPES];
+extern const SCAN_ORDER av1_intra_scan_orders[TX_SIZES][TX_TYPES];
+extern const SCAN_ORDER av1_inter_scan_orders[TX_SIZES_ALL][TX_TYPES];
#if CONFIG_ADAPT_SCAN
void av1_update_scan_prob(AV1_COMMON *cm, TX_SIZE tx_size, TX_TYPE tx_type,
@@ -67,13 +67,21 @@
1;
}
-static INLINE const SCAN_ORDER *get_scan(const AV1_COMMON *const cm,
- TX_SIZE tx_size, TX_TYPE tx_type) {
+static INLINE const SCAN_ORDER *get_scan(const AV1_COMMON *cm, TX_SIZE tx_size,
+ TX_TYPE tx_type, int is_inter) {
#if CONFIG_ADAPT_SCAN
+ (void)is_inter;
return &cm->fc->sc[tx_size][tx_type];
-#endif
+#else // CONFIG_ADAPT_SCAN
(void)cm;
- return &av1_scan_orders[tx_size][tx_type];
+#if CONFIG_EXT_TX || CONFIG_VAR_TX
+ return is_inter ? &av1_inter_scan_orders[tx_size][tx_type]
+ : &av1_intra_scan_orders[tx_size][tx_type];
+#else
+ (void)is_inter;
+ return &av1_intra_scan_orders[tx_size][tx_type];
+#endif  // CONFIG_EXT_TX || CONFIG_VAR_TX
+#endif // CONFIG_ADAPT_SCAN
}
#ifdef __cplusplus
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index 88466ba..5aa9198 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -85,6 +85,7 @@
#endif // CONFIG_MULTITHREAD
}
+#if !CONFIG_EXT_PARTITION_TYPES
static INLINE enum lf_path get_loop_filter_path(
int y_only, struct macroblockd_plane planes[MAX_MB_PLANE]) {
if (y_only)
@@ -140,21 +141,23 @@
}
}
}
-
+#endif
// Row-based multi-threaded loopfilter hook
#if CONFIG_PARALLEL_DEBLOCKING
static int loop_filter_ver_row_worker(AV1LfSync *const lf_sync,
LFWorkerData *const lf_data) {
const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
int mi_row, mi_col;
+#if !CONFIG_EXT_PARTITION_TYPES
enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
-
+#endif
for (mi_row = lf_data->start; mi_row < lf_data->stop;
- mi_row += lf_sync->num_workers * MAX_MIB_SIZE) {
+ mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
MODE_INFO **const mi =
lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
- for (mi_col = 0; mi_col < lf_data->cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+ for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+ mi_col += lf_data->cm->mib_size) {
LOOP_FILTER_MASK lfm;
int plane;
@@ -163,9 +166,16 @@
av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
lf_data->cm->mi_stride, &lfm);
+#if CONFIG_EXT_PARTITION_TYPES
+ for (plane = 0; plane < num_planes; ++plane)
+ av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane],
+ mi + mi_col, mi_row, mi_col);
+#else
+
for (plane = 0; plane < num_planes; ++plane)
loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane,
mi + mi_col, mi_row, mi_col, path, &lfm);
+#endif
}
}
return 1;
@@ -175,18 +185,21 @@
LFWorkerData *const lf_data) {
const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
const int sb_cols =
- mi_cols_aligned_to_sb(lf_data->cm->mi_cols) >> MAX_MIB_SIZE_LOG2;
+ mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2;
int mi_row, mi_col;
+#if !CONFIG_EXT_PARTITION_TYPES
enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
+#endif
for (mi_row = lf_data->start; mi_row < lf_data->stop;
- mi_row += lf_sync->num_workers * MAX_MIB_SIZE) {
+ mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
MODE_INFO **const mi =
lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
- for (mi_col = 0; mi_col < lf_data->cm->mi_cols; mi_col += MAX_MIB_SIZE) {
- const int r = mi_row >> MAX_MIB_SIZE_LOG2;
- const int c = mi_col >> MAX_MIB_SIZE_LOG2;
+ for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+ mi_col += lf_data->cm->mib_size) {
+ const int r = mi_row >> lf_data->cm->mib_size_log2;
+ const int c = mi_col >> lf_data->cm->mib_size_log2;
LOOP_FILTER_MASK lfm;
int plane;
@@ -198,39 +211,64 @@
mi_col);
av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
lf_data->cm->mi_stride, &lfm);
-
+#if CONFIG_EXT_PARTITION_TYPES
+ for (plane = 0; plane < num_planes; ++plane)
+ av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane],
+ mi + mi_col, mi_row, mi_col);
+#else
for (plane = 0; plane < num_planes; ++plane)
loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane,
mi + mi_col, mi_row, mi_col, path, &lfm);
+#endif
sync_write(lf_sync, r, c, sb_cols);
}
}
return 1;
}
-#else // CONFIG_PARALLEL_DEBLOCKING
+#else // CONFIG_PARALLEL_DEBLOCKING
static int loop_filter_row_worker(AV1LfSync *const lf_sync,
LFWorkerData *const lf_data) {
const int num_planes = lf_data->y_only ? 1 : MAX_MB_PLANE;
const int sb_cols =
- mi_cols_aligned_to_sb(lf_data->cm->mi_cols) >> MAX_MIB_SIZE_LOG2;
+ mi_cols_aligned_to_sb(lf_data->cm) >> lf_data->cm->mib_size_log2;
int mi_row, mi_col;
+#if !CONFIG_EXT_PARTITION_TYPES
enum lf_path path = get_loop_filter_path(lf_data->y_only, lf_data->planes);
+#endif // !CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_EXT_PARTITION
+ printf(
+ "STOPPING: This code has not been modified to work with the "
+ "extended coding unit size experiment");
+ exit(EXIT_FAILURE);
+#endif // CONFIG_EXT_PARTITION
for (mi_row = lf_data->start; mi_row < lf_data->stop;
- mi_row += lf_sync->num_workers * MAX_MIB_SIZE) {
+ mi_row += lf_sync->num_workers * lf_data->cm->mib_size) {
MODE_INFO **const mi =
lf_data->cm->mi_grid_visible + mi_row * lf_data->cm->mi_stride;
- for (mi_col = 0; mi_col < lf_data->cm->mi_cols; mi_col += MAX_MIB_SIZE) {
- const int r = mi_row >> MAX_MIB_SIZE_LOG2;
- const int c = mi_col >> MAX_MIB_SIZE_LOG2;
+ for (mi_col = 0; mi_col < lf_data->cm->mi_cols;
+ mi_col += lf_data->cm->mib_size) {
+ const int r = mi_row >> lf_data->cm->mib_size_log2;
+ const int c = mi_col >> lf_data->cm->mib_size_log2;
+#if !CONFIG_EXT_PARTITION_TYPES
LOOP_FILTER_MASK lfm;
+#endif
int plane;
sync_read(lf_sync, r, c);
av1_setup_dst_planes(lf_data->planes, lf_data->frame_buffer, mi_row,
mi_col);
+#if CONFIG_EXT_PARTITION_TYPES
+ for (plane = 0; plane < num_planes; ++plane) {
+ av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane],
+ mi + mi_col, mi_row, mi_col);
+ av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane],
+ mi + mi_col, mi_row, mi_col);
+ }
+#else
av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + mi_col,
lf_data->cm->mi_stride, &lfm);
@@ -240,7 +278,7 @@
loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane,
mi + mi_col, mi_row, mi_col, path, &lfm);
}
-
+#endif // CONFIG_EXT_PARTITION_TYPES
sync_write(lf_sync, r, c, sb_cols);
}
}
@@ -255,13 +293,20 @@
AV1LfSync *lf_sync) {
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
// Number of superblock rows and cols
- const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MAX_MIB_SIZE_LOG2;
+ const int sb_rows = mi_rows_aligned_to_sb(cm) >> cm->mib_size_log2;
// Decoder may allocate more threads than number of tiles based on user's
// input.
- const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_cols = cm->tile_cols;
const int num_workers = AOMMIN(nworkers, tile_cols);
int i;
+#if CONFIG_EXT_PARTITION
+ printf(
+ "STOPPING: This code has not been modified to work with the "
+ "extended coding unit size experiment");
+ exit(EXIT_FAILURE);
+#endif // CONFIG_EXT_PARTITION
+
if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
num_workers > lf_sync->num_workers) {
av1_loop_filter_dealloc(lf_sync);
@@ -292,7 +337,7 @@
// Loopfilter data
av1_loop_filter_data_reset(lf_data, frame, cm, planes);
- lf_data->start = start + i * MAX_MIB_SIZE;
+ lf_data->start = start + i * cm->mib_size;
lf_data->stop = stop;
lf_data->y_only = y_only;
@@ -321,7 +366,7 @@
// Loopfilter data
av1_loop_filter_data_reset(lf_data, frame, cm, planes);
- lf_data->start = start + i * MAX_MIB_SIZE;
+ lf_data->start = start + i * cm->mib_size;
lf_data->stop = stop;
lf_data->y_only = y_only;
@@ -351,7 +396,7 @@
// Loopfilter data
av1_loop_filter_data_reset(lf_data, frame, cm, planes);
- lf_data->start = start + i * MAX_MIB_SIZE;
+ lf_data->start = start + i * cm->mib_size;
lf_data->stop = stop;
lf_data->y_only = y_only;
@@ -471,204 +516,21 @@
}
}
-// Accumulate frame counts.
-void av1_accumulate_frame_counts(AV1_COMMON *cm, FRAME_COUNTS *counts,
- int is_dec) {
- int i, j, k, l, m;
+// Accumulate frame counts. FRAME_COUNTS consists solely of 'unsigned int'
+// members, so we treat it as an array and sum over the whole length.
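+// Note: this flat summation relies on FRAME_COUNTS holding nothing but
+// 'unsigned int' members (no other types, no padding); introducing any other
+// member type would silently misalign the accumulation below.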
+void av1_accumulate_frame_counts(AV1_COMMON *cm, FRAME_COUNTS *counts) {
+ unsigned int *const acc = (unsigned int *)&cm->counts;
+ const unsigned int *const cnt = (unsigned int *)counts;
- for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
- for (j = 0; j < INTRA_MODES; j++)
- cm->counts.y_mode[i][j] += counts->y_mode[i][j];
+ const unsigned int n_counts = sizeof(FRAME_COUNTS) / sizeof(unsigned int);
+ unsigned int i;
- for (i = 0; i < INTRA_MODES; i++)
- for (j = 0; j < INTRA_MODES; j++)
- cm->counts.uv_mode[i][j] += counts->uv_mode[i][j];
-
- for (i = 0; i < PARTITION_CONTEXTS; i++)
- for (j = 0; j < PARTITION_TYPES; j++)
- cm->counts.partition[i][j] += counts->partition[i][j];
-
- if (is_dec) {
- int n;
- for (i = 0; i < TX_SIZES; i++)
- for (j = 0; j < PLANE_TYPES; j++)
- for (k = 0; k < REF_TYPES; k++)
- for (l = 0; l < COEF_BANDS; l++)
- for (m = 0; m < COEFF_CONTEXTS; m++) {
- cm->counts.eob_branch[i][j][k][l][m] +=
- counts->eob_branch[i][j][k][l][m];
- for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
- cm->counts.coef[i][j][k][l][m][n] +=
- counts->coef[i][j][k][l][m][n];
- }
- } else {
- for (i = 0; i < TX_SIZES; i++)
- for (j = 0; j < PLANE_TYPES; j++)
- for (k = 0; k < REF_TYPES; k++)
- for (l = 0; l < COEF_BANDS; l++)
- for (m = 0; m < COEFF_CONTEXTS; m++)
- cm->counts.eob_branch[i][j][k][l][m] +=
- counts->eob_branch[i][j][k][l][m];
- // In the encoder, cm->counts.coef is only updated at frame
- // level, so not need to accumulate it here.
- // for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
- // cm->counts.coef[i][j][k][l][m][n] +=
- // counts->coef[i][j][k][l][m][n];
- }
-
- for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
- for (j = 0; j < SWITCHABLE_FILTERS; j++)
- cm->counts.switchable_interp[i][j] += counts->switchable_interp[i][j];
-
-#if CONFIG_REF_MV
- for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
- for (j = 0; j < 2; ++j)
- cm->counts.newmv_mode[i][j] += counts->newmv_mode[i][j];
-
- for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
- for (j = 0; j < 2; ++j)
- cm->counts.zeromv_mode[i][j] += counts->zeromv_mode[i][j];
-
- for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
- for (j = 0; j < 2; ++j)
- cm->counts.refmv_mode[i][j] += counts->refmv_mode[i][j];
-
- for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
- for (j = 0; j < 2; ++j) cm->counts.drl_mode[i][j] += counts->drl_mode[i][j];
-#endif
-
- for (i = 0; i < INTER_MODE_CONTEXTS; i++)
- for (j = 0; j < INTER_MODES; j++)
- cm->counts.inter_mode[i][j] += counts->inter_mode[i][j];
-
-#if CONFIG_MOTION_VAR
- for (i = 0; i < BLOCK_SIZES; ++i)
- for (j = 0; j < MOTION_MODES; ++j)
- cm->counts.motion_mode[i][j] += counts->motion_mode[i][j];
-#endif // CONFIG_MOTION_VAR
-
- for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
- for (j = 0; j < 2; j++)
- cm->counts.intra_inter[i][j] += counts->intra_inter[i][j];
-
- for (i = 0; i < COMP_INTER_CONTEXTS; i++)
- for (j = 0; j < 2; j++)
- cm->counts.comp_inter[i][j] += counts->comp_inter[i][j];
-
- for (i = 0; i < REF_CONTEXTS; i++)
- for (j = 0; j < (SINGLE_REFS - 1); j++)
- for (k = 0; k < 2; k++)
- cm->counts.single_ref[i][j][k] += counts->single_ref[i][j][k];
-
-#if CONFIG_EXT_REFS
- for (i = 0; i < REF_CONTEXTS; i++)
- for (j = 0; j < (FWD_REFS - 1); j++)
- for (k = 0; k < 2; k++)
- cm->counts.comp_fwdref[i][j][k] += counts->comp_fwdref[i][j][k];
- for (i = 0; i < REF_CONTEXTS; i++)
- for (j = 0; j < (BWD_REFS - 1); j++)
- for (k = 0; k < 2; k++)
- cm->counts.comp_bwdref[i][j][k] += counts->comp_bwdref[i][j][k];
-#else
- for (i = 0; i < REF_CONTEXTS; i++)
- for (j = 0; j < 2; j++) cm->counts.comp_ref[i][j] += counts->comp_ref[i][j];
-#endif // CONFIG_EXT_REFS
-
- for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- for (j = TX_4X4; j < TX_SIZES; j++)
- cm->counts.tx.p32x32[i][j] += counts->tx.p32x32[i][j];
-
- for (j = TX_4X4; j < TX_SIZES - 1; j++)
- cm->counts.tx.p16x16[i][j] += counts->tx.p16x16[i][j];
-
- for (j = TX_4X4; j < TX_SIZES - 2; j++)
- cm->counts.tx.p8x8[i][j] += counts->tx.p8x8[i][j];
- }
-
- for (i = TX_4X4; i < TX_SIZES; i++)
- cm->counts.tx.tx_totals[i] += counts->tx.tx_totals[i];
-
- for (i = 0; i < SKIP_CONTEXTS; i++)
- for (j = 0; j < 2; j++) cm->counts.skip[i][j] += counts->skip[i][j];
-
-#if CONFIG_REF_MV
- for (m = 0; m < NMV_CONTEXTS; ++m) {
- for (i = 0; i < MV_JOINTS; i++)
- cm->counts.mv[m].joints[i] += counts->mv[m].joints[i];
-
- for (k = 0; k < 2; k++) {
- nmv_component_counts *comps = &cm->counts.mv[m].comps[k];
- nmv_component_counts *comps_t = &counts->mv[m].comps[k];
-
- for (i = 0; i < 2; i++) {
- comps->sign[i] += comps_t->sign[i];
- comps->class0_hp[i] += comps_t->class0_hp[i];
- comps->hp[i] += comps_t->hp[i];
- }
-
- for (i = 0; i < MV_CLASSES; i++) comps->classes[i] += comps_t->classes[i];
-
- for (i = 0; i < CLASS0_SIZE; i++) {
- comps->class0[i] += comps_t->class0[i];
- for (j = 0; j < MV_FP_SIZE; j++)
- comps->class0_fp[i][j] += comps_t->class0_fp[i][j];
- }
-
- for (i = 0; i < MV_OFFSET_BITS; i++)
- for (j = 0; j < 2; j++) comps->bits[i][j] += comps_t->bits[i][j];
-
- for (i = 0; i < MV_FP_SIZE; i++) comps->fp[i] += comps_t->fp[i];
- }
- }
-#else
- for (i = 0; i < MV_JOINTS; i++)
- cm->counts.mv.joints[i] += counts->mv.joints[i];
-
- for (k = 0; k < 2; k++) {
- nmv_component_counts *comps = &cm->counts.mv.comps[k];
- nmv_component_counts *comps_t = &counts->mv.comps[k];
-
- for (i = 0; i < 2; i++) {
- comps->sign[i] += comps_t->sign[i];
- comps->class0_hp[i] += comps_t->class0_hp[i];
- comps->hp[i] += comps_t->hp[i];
- }
-
- for (i = 0; i < MV_CLASSES; i++) comps->classes[i] += comps_t->classes[i];
-
- for (i = 0; i < CLASS0_SIZE; i++) {
- comps->class0[i] += comps_t->class0[i];
- for (j = 0; j < MV_FP_SIZE; j++)
- comps->class0_fp[i][j] += comps_t->class0_fp[i][j];
- }
-
- for (i = 0; i < MV_OFFSET_BITS; i++)
- for (j = 0; j < 2; j++) comps->bits[i][j] += comps_t->bits[i][j];
-
- for (i = 0; i < MV_FP_SIZE; i++) comps->fp[i] += comps_t->fp[i];
- }
-#endif
-
- for (i = 0; i < EXT_TX_SIZES; i++) {
- for (j = 0; j < TX_TYPES; ++j)
- for (k = 0; k < TX_TYPES; k++)
- cm->counts.intra_ext_tx[i][j][k] += counts->intra_ext_tx[i][j][k];
- }
- for (i = 0; i < EXT_TX_SIZES; i++) {
- for (k = 0; k < TX_TYPES; k++)
- cm->counts.inter_ext_tx[i][k] += counts->inter_ext_tx[i][k];
- }
-
- for (i = 0; i < PREDICTION_PROBS; i++)
- for (j = 0; j < 2; j++) cm->counts.seg.pred[i][j] += counts->seg.pred[i][j];
-
- for (i = 0; i < MAX_SEGMENTS; i++) {
- cm->counts.seg.tree_total[i] += counts->seg.tree_total[i];
- cm->counts.seg.tree_mispred[i] += counts->seg.tree_mispred[i];
- }
+ for (i = 0; i < n_counts; i++) acc[i] += cnt[i];
#if CONFIG_DELTA_Q
- for (i = 0; i < DELTA_Q_CONTEXTS; i++)
+ for (i = 0; i < DELTA_Q_CONTEXTS; i++) {
+ int j;
for (j = 0; j < 2; ++j) cm->counts.delta_q[i][j] += counts->delta_q[i][j];
+ }
#endif
}
diff --git a/av1/common/thread_common.h b/av1/common/thread_common.h
index 8f33d27..eba37df 100644
--- a/av1/common/thread_common.h
+++ b/av1/common/thread_common.h
@@ -55,7 +55,7 @@
int num_workers, AV1LfSync *lf_sync);
void av1_accumulate_frame_counts(struct AV1Common *cm,
- struct FRAME_COUNTS *counts, int is_dec);
+ struct FRAME_COUNTS *counts);
#ifdef __cplusplus
} // extern "C"
diff --git a/av1/common/tile_common.c b/av1/common/tile_common.c
index 0abf530..2c0e5c6 100644
--- a/av1/common/tile_common.c
+++ b/av1/common/tile_common.c
@@ -13,23 +13,14 @@
#include "av1/common/onyxc_int.h"
#include "aom_dsp/aom_dsp_common.h"
-#define MIN_TILE_WIDTH_B64 4
-#define MAX_TILE_WIDTH_B64 64
-
-static int get_tile_offset(int idx, int mis, int log2) {
- const int sb_cols = mi_cols_aligned_to_sb(mis) >> MAX_MIB_SIZE_LOG2;
- const int offset = ((idx * sb_cols) >> log2) << MAX_MIB_SIZE_LOG2;
- return AOMMIN(offset, mis);
-}
-
void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) {
- tile->mi_row_start = get_tile_offset(row, cm->mi_rows, cm->log2_tile_rows);
- tile->mi_row_end = get_tile_offset(row + 1, cm->mi_rows, cm->log2_tile_rows);
+ tile->mi_row_start = row * cm->tile_height;
+ tile->mi_row_end = AOMMIN(tile->mi_row_start + cm->tile_height, cm->mi_rows);
}
void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
- tile->mi_col_start = get_tile_offset(col, cm->mi_cols, cm->log2_tile_cols);
- tile->mi_col_end = get_tile_offset(col + 1, cm->mi_cols, cm->log2_tile_cols);
+ tile->mi_col_start = col * cm->tile_width;
+ tile->mi_col_end = AOMMIN(tile->mi_col_start + cm->tile_width, cm->mi_cols);
}
void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) {
@@ -37,22 +28,34 @@
av1_tile_set_col(tile, cm, col);
}
-static int get_min_log2_tile_cols(const int sb64_cols) {
+#if !CONFIG_EXT_TILE
+
+#if CONFIG_EXT_PARTITION
+#define MIN_TILE_WIDTH_MAX_SB 2
+#define MAX_TILE_WIDTH_MAX_SB 32
+#else
+#define MIN_TILE_WIDTH_MAX_SB 4
+#define MAX_TILE_WIDTH_MAX_SB 64
+#endif // CONFIG_EXT_PARTITION
+
+static int get_min_log2_tile_cols(const int max_sb_cols) {
int min_log2 = 0;
- while ((MAX_TILE_WIDTH_B64 << min_log2) < sb64_cols) ++min_log2;
+ while ((MAX_TILE_WIDTH_MAX_SB << min_log2) < max_sb_cols) ++min_log2;
return min_log2;
}
-static int get_max_log2_tile_cols(const int sb64_cols) {
+static int get_max_log2_tile_cols(const int max_sb_cols) {
int max_log2 = 1;
- while ((sb64_cols >> max_log2) >= MIN_TILE_WIDTH_B64) ++max_log2;
+ while ((max_sb_cols >> max_log2) >= MIN_TILE_WIDTH_MAX_SB) ++max_log2;
return max_log2 - 1;
}
-void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
+void av1_get_tile_n_bits(const int mi_cols, int *min_log2_tile_cols,
int *max_log2_tile_cols) {
- const int sb64_cols = mi_cols_aligned_to_sb(mi_cols) >> MAX_MIB_SIZE_LOG2;
- *min_log2_tile_cols = get_min_log2_tile_cols(sb64_cols);
- *max_log2_tile_cols = get_max_log2_tile_cols(sb64_cols);
+ const int max_sb_cols =
+ ALIGN_POWER_OF_TWO(mi_cols, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
+ *min_log2_tile_cols = get_min_log2_tile_cols(max_sb_cols);
+ *max_log2_tile_cols = get_max_log2_tile_cols(max_sb_cols);
assert(*min_log2_tile_cols <= *max_log2_tile_cols);
}
+#endif // !CONFIG_EXT_TILE
diff --git a/av1/common/tile_common.h b/av1/common/tile_common.h
index a0164dc..d63d260 100644
--- a/av1/common/tile_common.h
+++ b/av1/common/tile_common.h
@@ -35,7 +35,7 @@
void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row);
void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col);
-void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
+void av1_get_tile_n_bits(const int mi_cols, int *min_log2_tile_cols,
int *max_log2_tile_cols);
#ifdef __cplusplus
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
new file mode 100644
index 0000000..7bbf20f
--- /dev/null
+++ b/av1/common/warped_motion.c
@@ -0,0 +1,1198 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be
+ * found in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "av1/common/warped_motion.h"
+
+static ProjectPointsFunc get_project_points_type(TransformationType type) {
+ switch (type) {
+ case HOMOGRAPHY: return project_points_homography;
+ case AFFINE: return project_points_affine;
+ case ROTZOOM: return project_points_rotzoom;
+ case TRANSLATION: return project_points_translation;
+ default: assert(0); return NULL;
+ }
+}
+
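+// The project_points_* functions below map integer pixel positions through
+// the warp model in fixed point: the model coefficients are taken to be in
+// WARPEDMODEL_PREC_BITS precision and the projected coordinates come out in
+// what appears to be WARPEDPIXEL_PREC_BITS precision (hence the rounding by
+// WARPEDDIFF_PREC_BITS). The subsampling flags adjust for chroma planes.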
+void project_points_translation(int32_t *mat, int *points, int *proj,
+ const int n, const int stride_points,
+ const int stride_proj, const int subsampling_x,
+ const int subsampling_y) {
+ int i;
+ for (i = 0; i < n; ++i) {
+ const int x = *(points++), y = *(points++);
+ if (subsampling_x)
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+ ((x * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[1]),
+ WARPEDDIFF_PREC_BITS + 1);
+ else
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+ ((x * (1 << WARPEDMODEL_PREC_BITS)) + mat[1]), WARPEDDIFF_PREC_BITS);
+ if (subsampling_y)
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+ ((y * (1 << (WARPEDMODEL_PREC_BITS + 1))) + mat[0]),
+ WARPEDDIFF_PREC_BITS + 1);
+ else
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+ ((y * (1 << WARPEDMODEL_PREC_BITS))) + mat[0], WARPEDDIFF_PREC_BITS);
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+void project_points_rotzoom(int32_t *mat, int *points, int *proj, const int n,
+ const int stride_points, const int stride_proj,
+ const int subsampling_x, const int subsampling_y) {
+ int i;
+ for (i = 0; i < n; ++i) {
+ const int x = *(points++), y = *(points++);
+ if (subsampling_x)
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+ mat[3] * 2 * x + mat[2] * 2 * y + mat[1] +
+ (mat[3] + mat[2] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ WARPEDDIFF_PREC_BITS + 1);
+ else
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[3] * x + mat[2] * y + mat[1],
+ WARPEDDIFF_PREC_BITS);
+ if (subsampling_y)
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+ -mat[2] * 2 * x + mat[3] * 2 * y + mat[0] +
+ (-mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ WARPEDDIFF_PREC_BITS + 1);
+ else
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(-mat[2] * x + mat[3] * y + mat[0],
+ WARPEDDIFF_PREC_BITS);
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+void project_points_affine(int32_t *mat, int *points, int *proj, const int n,
+ const int stride_points, const int stride_proj,
+ const int subsampling_x, const int subsampling_y) {
+ int i;
+ for (i = 0; i < n; ++i) {
+ const int x = *(points++), y = *(points++);
+ if (subsampling_x)
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+ mat[3] * 2 * x + mat[2] * 2 * y + mat[1] +
+ (mat[3] + mat[2] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ WARPEDDIFF_PREC_BITS + 1);
+ else
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[3] * x + mat[2] * y + mat[1],
+ WARPEDDIFF_PREC_BITS);
+ if (subsampling_y)
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+ mat[4] * 2 * x + mat[5] * 2 * y + mat[0] +
+ (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+ WARPEDDIFF_PREC_BITS + 1);
+ else
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[4] * x + mat[5] * y + mat[0],
+ WARPEDDIFF_PREC_BITS);
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+void project_points_homography(int32_t *mat, int *points, int *proj,
+ const int n, const int stride_points,
+ const int stride_proj, const int subsampling_x,
+ const int subsampling_y) {
+ int i;
+ int64_t x, y, Z;
+ int64_t xp, yp;
+ for (i = 0; i < n; ++i) {
+ x = *(points++), y = *(points++);
+ x = (subsampling_x ? 4 * x + 1 : 2 * x);
+ y = (subsampling_y ? 4 * y + 1 : 2 * y);
+
+ Z = (mat[7] * x + mat[6] * y + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
+ xp = (mat[1] * x + mat[0] * y + 2 * mat[3]) *
+ (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
+ WARPEDMODEL_PREC_BITS));
+ yp = (mat[2] * x + mat[5] * y + 2 * mat[4]) *
+ (1 << (WARPEDPIXEL_PREC_BITS + WARPEDMODEL_ROW3HOMO_PREC_BITS -
+ WARPEDMODEL_PREC_BITS));
+
+ xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
+ yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;
+
+ if (subsampling_x) xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
+ if (subsampling_y) yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
+ *(proj++) = xp;
+ *(proj++) = yp;
+
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
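+// 6-tap sub-pel filter kernels, one row per WARPEDPIXEL_PREC_SHIFTS
+// fractional position. Each row sums to 128, i.e. unit DC gain at what
+// appears to be 1 << WARPEDPIXEL_FILTER_BITS.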
+static const int16_t
+ filter_ntap[WARPEDPIXEL_PREC_SHIFTS][WARPEDPIXEL_FILTER_TAPS] = {
+ { 0, 0, 128, 0, 0, 0 }, { 0, -1, 128, 2, -1, 0 },
+ { 1, -3, 127, 4, -1, 0 }, { 1, -4, 126, 6, -2, 1 },
+ { 1, -5, 126, 8, -3, 1 }, { 1, -6, 125, 11, -4, 1 },
+ { 1, -7, 124, 13, -4, 1 }, { 2, -8, 123, 15, -5, 1 },
+ { 2, -9, 122, 18, -6, 1 }, { 2, -10, 121, 20, -6, 1 },
+ { 2, -11, 120, 22, -7, 2 }, { 2, -12, 119, 25, -8, 2 },
+ { 3, -13, 117, 27, -8, 2 }, { 3, -13, 116, 29, -9, 2 },
+ { 3, -14, 114, 32, -10, 3 }, { 3, -15, 113, 35, -10, 2 },
+ { 3, -15, 111, 37, -11, 3 }, { 3, -16, 109, 40, -11, 3 },
+ { 3, -16, 108, 42, -12, 3 }, { 4, -17, 106, 45, -13, 3 },
+ { 4, -17, 104, 47, -13, 3 }, { 4, -17, 102, 50, -14, 3 },
+ { 4, -17, 100, 52, -14, 3 }, { 4, -18, 98, 55, -15, 4 },
+ { 4, -18, 96, 58, -15, 3 }, { 4, -18, 94, 60, -16, 4 },
+ { 4, -18, 91, 63, -16, 4 }, { 4, -18, 89, 65, -16, 4 },
+ { 4, -18, 87, 68, -17, 4 }, { 4, -18, 85, 70, -17, 4 },
+ { 4, -18, 82, 73, -17, 4 }, { 4, -18, 80, 75, -17, 4 },
+ { 4, -18, 78, 78, -18, 4 }, { 4, -17, 75, 80, -18, 4 },
+ { 4, -17, 73, 82, -18, 4 }, { 4, -17, 70, 85, -18, 4 },
+ { 4, -17, 68, 87, -18, 4 }, { 4, -16, 65, 89, -18, 4 },
+ { 4, -16, 63, 91, -18, 4 }, { 4, -16, 60, 94, -18, 4 },
+ { 3, -15, 58, 96, -18, 4 }, { 4, -15, 55, 98, -18, 4 },
+ { 3, -14, 52, 100, -17, 4 }, { 3, -14, 50, 102, -17, 4 },
+ { 3, -13, 47, 104, -17, 4 }, { 3, -13, 45, 106, -17, 4 },
+ { 3, -12, 42, 108, -16, 3 }, { 3, -11, 40, 109, -16, 3 },
+ { 3, -11, 37, 111, -15, 3 }, { 2, -10, 35, 113, -15, 3 },
+ { 3, -10, 32, 114, -14, 3 }, { 2, -9, 29, 116, -13, 3 },
+ { 2, -8, 27, 117, -13, 3 }, { 2, -8, 25, 119, -12, 2 },
+ { 2, -7, 22, 120, -11, 2 }, { 1, -6, 20, 121, -10, 2 },
+ { 1, -6, 18, 122, -9, 2 }, { 1, -5, 15, 123, -8, 2 },
+ { 1, -4, 13, 124, -7, 1 }, { 1, -4, 11, 125, -6, 1 },
+ { 1, -3, 8, 126, -5, 1 }, { 1, -2, 6, 126, -4, 1 },
+ { 0, -1, 4, 127, -3, 1 }, { 0, -1, 2, 128, -1, 0 },
+ };
+
+static int32_t do_ntap_filter(int32_t *p, int x) {
+ int i;
+ int32_t sum = 0;
+ for (i = 0; i < WARPEDPIXEL_FILTER_TAPS; ++i) {
+ sum += p[i - WARPEDPIXEL_FILTER_TAPS / 2 + 1] * filter_ntap[x][i];
+ }
+ return sum;
+}
+
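+// Evaluates a Catmull-Rom style cubic through p[-1..2] at fractional
+// position x (in WARPEDPIXEL_PREC_BITS precision), returning the result
+// scaled by 1 << WARPEDPIXEL_FILTER_BITS to match the n-tap filter path.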
+static int32_t do_cubic_filter(int32_t *p, int x) {
+ if (x == 0) {
+ return p[0] * (1 << WARPEDPIXEL_FILTER_BITS);
+ } else if (x == (1 << WARPEDPIXEL_PREC_BITS)) {
+ return p[1] * (1 << WARPEDPIXEL_FILTER_BITS);
+ } else {
+ const int64_t v1 = (int64_t)x * x * x * (3 * (p[0] - p[1]) + p[2] - p[-1]);
+ const int64_t v2 = x * x * (2 * p[-1] - 5 * p[0] + 4 * p[1] - p[2]);
+ const int64_t v3 = x * (p[1] - p[-1]);
+ const int64_t v4 = 2 * p[0];
+ return (int32_t)ROUND_POWER_OF_TWO_SIGNED(
+ (v4 * (1 << (3 * WARPEDPIXEL_PREC_BITS))) +
+ (v3 * (1 << (2 * WARPEDPIXEL_PREC_BITS))) +
+ (v2 * (1 << WARPEDPIXEL_PREC_BITS)) + v1,
+ 3 * WARPEDPIXEL_PREC_BITS + 1 - WARPEDPIXEL_FILTER_BITS);
+ }
+}
+
+static INLINE void get_subcolumn(int taps, uint8_t *ref, int32_t *col,
+ int stride, int x, int y_start) {
+ int i;
+ for (i = 0; i < taps; ++i) {
+ col[i] = ref[(i + y_start) * stride + x];
+ }
+}
+
+static uint8_t bi_ntap_filter(uint8_t *ref, int x, int y, int stride) {
+ int32_t val, arr[WARPEDPIXEL_FILTER_TAPS];
+ int k;
+ int i = (int)x >> WARPEDPIXEL_PREC_BITS;
+ int j = (int)y >> WARPEDPIXEL_PREC_BITS;
+ for (k = 0; k < WARPEDPIXEL_FILTER_TAPS; ++k) {
+ int32_t arr_temp[WARPEDPIXEL_FILTER_TAPS];
+ get_subcolumn(WARPEDPIXEL_FILTER_TAPS, ref, arr_temp, stride,
+ i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
+ j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
+ arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
+ y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
+ }
+ val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
+ x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
+ val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
+ return (uint8_t)clip_pixel(val);
+}
+
+static uint8_t bi_cubic_filter(uint8_t *ref, int x, int y, int stride) {
+ int32_t val, arr[4];
+ int k;
+ int i = (int)x >> WARPEDPIXEL_PREC_BITS;
+ int j = (int)y >> WARPEDPIXEL_PREC_BITS;
+ for (k = 0; k < 4; ++k) {
+ int32_t arr_temp[4];
+ get_subcolumn(4, ref, arr_temp, stride, i + k - 1, j - 1);
+ arr[k] =
+ do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
+ }
+ val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
+ val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
+ return (uint8_t)clip_pixel(val);
+}
+
+static uint8_t bi_linear_filter(uint8_t *ref, int x, int y, int stride) {
+ const int ix = x >> WARPEDPIXEL_PREC_BITS;
+ const int iy = y >> WARPEDPIXEL_PREC_BITS;
+ const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
+ const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
+ int32_t val;
+ val = ROUND_POWER_OF_TWO_SIGNED(
+ ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
+ (WARPEDPIXEL_PREC_SHIFTS - sx) +
+ ref[iy * stride + ix + 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) * sx +
+ ref[(iy + 1) * stride + ix] * sy * (WARPEDPIXEL_PREC_SHIFTS - sx) +
+ ref[(iy + 1) * stride + ix + 1] * sy * sx,
+ WARPEDPIXEL_PREC_BITS * 2);
+ return (uint8_t)clip_pixel(val);
+}
+
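+// Samples the reference at a warped position given in WARPEDPIXEL_PREC_BITS
+// fixed point. Positions outside the frame are clamped to the nearest border
+// samples; interior positions with full 6-tap support use bi_ntap_filter,
+// while positions close to the border fall back to the cubic and then the
+// bilinear filter.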
+static uint8_t warp_interpolate(uint8_t *ref, int x, int y, int width,
+ int height, int stride) {
+ int ix = x >> WARPEDPIXEL_PREC_BITS;
+ int iy = y >> WARPEDPIXEL_PREC_BITS;
+ int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
+ int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
+ int32_t v;
+
+ if (ix < 0 && iy < 0)
+ return ref[0];
+ else if (ix < 0 && iy > height - 1)
+ return ref[(height - 1) * stride];
+ else if (ix > width - 1 && iy < 0)
+ return ref[width - 1];
+ else if (ix > width - 1 && iy > height - 1)
+ return ref[(height - 1) * stride + (width - 1)];
+ else if (ix < 0) {
+ v = ROUND_POWER_OF_TWO_SIGNED(
+ ref[iy * stride] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
+ ref[(iy + 1) * stride] * sy,
+ WARPEDPIXEL_PREC_BITS);
+ return clip_pixel(v);
+ } else if (iy < 0) {
+ v = ROUND_POWER_OF_TWO_SIGNED(
+ ref[ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) + ref[ix + 1] * sx,
+ WARPEDPIXEL_PREC_BITS);
+ return clip_pixel(v);
+ } else if (ix > width - 1) {
+ v = ROUND_POWER_OF_TWO_SIGNED(
+ ref[iy * stride + width - 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
+ ref[(iy + 1) * stride + width - 1] * sy,
+ WARPEDPIXEL_PREC_BITS);
+ return clip_pixel(v);
+ } else if (iy > height - 1) {
+ v = ROUND_POWER_OF_TWO_SIGNED(
+ ref[(height - 1) * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) +
+ ref[(height - 1) * stride + ix + 1] * sx,
+ WARPEDPIXEL_PREC_BITS);
+ return clip_pixel(v);
+ } else if (ix >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
+ iy >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
+ ix < width - WARPEDPIXEL_FILTER_TAPS / 2 &&
+ iy < height - WARPEDPIXEL_FILTER_TAPS / 2) {
+ return bi_ntap_filter(ref, x, y, stride);
+ } else if (ix >= 1 && iy >= 1 && ix < width - 2 && iy < height - 2) {
+ return bi_cubic_filter(ref, x, y, stride);
+ } else {
+ return bi_linear_filter(ref, x, y, stride);
+ }
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+static INLINE void highbd_get_subcolumn(int taps, uint16_t *ref, int32_t *col,
+ int stride, int x, int y_start) {
+ int i;
+ for (i = 0; i < taps; ++i) {
+ col[i] = ref[(i + y_start) * stride + x];
+ }
+}
+
+static uint16_t highbd_bi_ntap_filter(uint16_t *ref, int x, int y, int stride,
+ int bd) {
+ int32_t val, arr[WARPEDPIXEL_FILTER_TAPS];
+ int k;
+ int i = (int)x >> WARPEDPIXEL_PREC_BITS;
+ int j = (int)y >> WARPEDPIXEL_PREC_BITS;
+ for (k = 0; k < WARPEDPIXEL_FILTER_TAPS; ++k) {
+ int32_t arr_temp[WARPEDPIXEL_FILTER_TAPS];
+ highbd_get_subcolumn(WARPEDPIXEL_FILTER_TAPS, ref, arr_temp, stride,
+ i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
+ j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
+ arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
+ y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
+ }
+ val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
+ x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
+ val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
+ return (uint16_t)clip_pixel_highbd(val, bd);
+}
+
+static uint16_t highbd_bi_cubic_filter(uint16_t *ref, int x, int y, int stride,
+ int bd) {
+ int32_t val, arr[4];
+ int k;
+ int i = (int)x >> WARPEDPIXEL_PREC_BITS;
+ int j = (int)y >> WARPEDPIXEL_PREC_BITS;
+ for (k = 0; k < 4; ++k) {
+ int32_t arr_temp[4];
+ highbd_get_subcolumn(4, ref, arr_temp, stride, i + k - 1, j - 1);
+ arr[k] =
+ do_cubic_filter(arr_temp + 1, y - (j * (1 << WARPEDPIXEL_PREC_BITS)));
+ }
+ val = do_cubic_filter(arr + 1, x - (i * (1 << WARPEDPIXEL_PREC_BITS)));
+ val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
+ return (uint16_t)clip_pixel_highbd(val, bd);
+}
+
+static uint16_t highbd_bi_linear_filter(uint16_t *ref, int x, int y, int stride,
+ int bd) {
+ const int ix = x >> WARPEDPIXEL_PREC_BITS;
+ const int iy = y >> WARPEDPIXEL_PREC_BITS;
+ const int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
+ const int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
+ int32_t val;
+ val = ROUND_POWER_OF_TWO_SIGNED(
+ ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
+ (WARPEDPIXEL_PREC_SHIFTS - sx) +
+ ref[iy * stride + ix + 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) * sx +
+ ref[(iy + 1) * stride + ix] * sy * (WARPEDPIXEL_PREC_SHIFTS - sx) +
+ ref[(iy + 1) * stride + ix + 1] * sy * sx,
+ WARPEDPIXEL_PREC_BITS * 2);
+ return (uint16_t)clip_pixel_highbd(val, bd);
+}
+
+static uint16_t highbd_warp_interpolate(uint16_t *ref, int x, int y, int width,
+ int height, int stride, int bd) {
+ int ix = x >> WARPEDPIXEL_PREC_BITS;
+ int iy = y >> WARPEDPIXEL_PREC_BITS;
+ int sx = x - (ix * (1 << WARPEDPIXEL_PREC_BITS));
+ int sy = y - (iy * (1 << WARPEDPIXEL_PREC_BITS));
+ int32_t v;
+
+ if (ix < 0 && iy < 0)
+ return ref[0];
+ else if (ix < 0 && iy > height - 1)
+ return ref[(height - 1) * stride];
+ else if (ix > width - 1 && iy < 0)
+ return ref[width - 1];
+ else if (ix > width - 1 && iy > height - 1)
+ return ref[(height - 1) * stride + (width - 1)];
+ else if (ix < 0) {
+ v = ROUND_POWER_OF_TWO_SIGNED(
+ ref[iy * stride] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
+ ref[(iy + 1) * stride] * sy,
+ WARPEDPIXEL_PREC_BITS);
+ return clip_pixel_highbd(v, bd);
+ } else if (iy < 0) {
+ v = ROUND_POWER_OF_TWO_SIGNED(
+ ref[ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) + ref[ix + 1] * sx,
+ WARPEDPIXEL_PREC_BITS);
+ return clip_pixel_highbd(v, bd);
+ } else if (ix > width - 1) {
+ v = ROUND_POWER_OF_TWO_SIGNED(
+ ref[iy * stride + width - 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
+ ref[(iy + 1) * stride + width - 1] * sy,
+ WARPEDPIXEL_PREC_BITS);
+ return clip_pixel_highbd(v, bd);
+ } else if (iy > height - 1) {
+ v = ROUND_POWER_OF_TWO_SIGNED(
+ ref[(height - 1) * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) +
+ ref[(height - 1) * stride + ix + 1] * sx,
+ WARPEDPIXEL_PREC_BITS);
+ return clip_pixel_highbd(v, bd);
+ } else if (ix >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
+ iy >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
+ ix < width - WARPEDPIXEL_FILTER_TAPS / 2 &&
+ iy < height - WARPEDPIXEL_FILTER_TAPS / 2) {
+ return highbd_bi_ntap_filter(ref, x, y, stride, bd);
+ } else if (ix >= 1 && iy >= 1 && ix < width - 2 && iy < height - 2) {
+ return highbd_bi_cubic_filter(ref, x, y, stride, bd);
+ } else {
+ return highbd_bi_linear_filter(ref, x, y, stride, bd);
+ }
+}
+
+static double highbd_warp_erroradv(WarpedMotionParams *wm, uint8_t *ref8,
+ int width, int height, int stride,
+ uint8_t *dst8, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ int x_scale, int y_scale, int bd) {
+ int i, j;
+ ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ int gm_err = 0, no_gm_err = 0;
+ int64_t gm_sumerr = 0, no_gm_sumerr = 0;
+ for (i = p_row; i < p_row + p_height; ++i) {
+ for (j = p_col; j < p_col + p_width; ++j) {
+ int in[2], out[2];
+ in[0] = j;
+ in[1] = i;
+ projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
+ out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, 4);
+ out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, 4);
+ gm_err = dst[(j - p_col) + (i - p_row) * p_stride] -
+ highbd_warp_interpolate(ref, out[0], out[1], width, height,
+ stride, bd);
+ no_gm_err = dst[(j - p_col) + (i - p_row) * p_stride] -
+ ref[(j - p_col) + (i - p_row) * stride];
+ gm_sumerr += (int64_t)gm_err * gm_err;
+ no_gm_sumerr += (int64_t)no_gm_err * no_gm_err;
+ }
+ }
+ return (double)gm_sumerr / no_gm_sumerr;
+}
+
+static void highbd_warp_plane(WarpedMotionParams *wm, uint8_t *ref8, int width,
+ int height, int stride, uint8_t *pred8, int p_col,
+ int p_row, int p_width, int p_height,
+ int p_stride, int subsampling_x,
+ int subsampling_y, int x_scale, int y_scale,
+ int bd, int ref_frm) {
+ int i, j;
+ ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ if (projectpoints == NULL) return;
+ for (i = p_row; i < p_row + p_height; ++i) {
+ for (j = p_col; j < p_col + p_width; ++j) {
+ int in[2], out[2];
+ in[0] = j;
+ in[1] = i;
+ projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
+ out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, 4);
+ out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, 4);
+ if (ref_frm)
+ pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO(
+ pred[(j - p_col) + (i - p_row) * p_stride] +
+ highbd_warp_interpolate(ref, out[0], out[1], width, height,
+ stride, bd),
+ 1);
+ else
+ pred[(j - p_col) + (i - p_row) * p_stride] = highbd_warp_interpolate(
+ ref, out[0], out[1], width, height, stride, bd);
+ }
+ }
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
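+// Returns the ratio of the warped (global motion) prediction error to the
+// error of copying the co-located reference pixels directly; values below 1
+// mean the warp model is an improvement.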
+static double warp_erroradv(WarpedMotionParams *wm, uint8_t *ref, int width,
+ int height, int stride, uint8_t *dst, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int x_scale,
+ int y_scale) {
+ int gm_err = 0, no_gm_err = 0;
+ int gm_sumerr = 0, no_gm_sumerr = 0;
+ int i, j;
+ ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype);
+ for (i = p_row; i < p_row + p_height; ++i) {
+ for (j = p_col; j < p_col + p_width; ++j) {
+ int in[2], out[2];
+ in[0] = j;
+ in[1] = i;
+ projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
+ out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, 4);
+ out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, 4);
+ gm_err = dst[(j - p_col) + (i - p_row) * p_stride] -
+ warp_interpolate(ref, out[0], out[1], width, height, stride);
+ no_gm_err = dst[(j - p_col) + (i - p_row) * p_stride] -
+ ref[(j - p_col) + (i - p_row) * stride];
+ gm_sumerr += gm_err * gm_err;
+ no_gm_sumerr += no_gm_err * no_gm_err;
+ }
+ }
+ return (double)gm_sumerr / no_gm_sumerr;
+}
+
+static void warp_plane(WarpedMotionParams *wm, uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int x_scale,
+ int y_scale, int ref_frm) {
+ int i, j;
+ ProjectPointsFunc projectpoints = get_project_points_type(wm->wmtype);
+ if (projectpoints == NULL) return;
+ for (i = p_row; i < p_row + p_height; ++i) {
+ for (j = p_col; j < p_col + p_width; ++j) {
+ int in[2], out[2];
+ in[0] = j;
+ in[1] = i;
+ projectpoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
+ out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, 4);
+ out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, 4);
+ if (ref_frm)
+ pred[(j - p_col) + (i - p_row) * p_stride] = ROUND_POWER_OF_TWO(
+ pred[(j - p_col) + (i - p_row) * p_stride] +
+ warp_interpolate(ref, out[0], out[1], width, height, stride),
+ 1);
+ else
+ pred[(j - p_col) + (i - p_row) * p_stride] =
+ warp_interpolate(ref, out[0], out[1], width, height, stride);
+ }
+ }
+}
+
+double av1_warp_erroradv(WarpedMotionParams *wm,
+#if CONFIG_AOM_HIGHBITDEPTH
+ int use_hbd, int bd,
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ uint8_t *ref, int width, int height, int stride,
+ uint8_t *dst, int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, int x_scale, int y_scale) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (use_hbd)
+ return highbd_warp_erroradv(
+ wm, ref, width, height, stride, dst, p_col, p_row, p_width, p_height,
+ p_stride, subsampling_x, subsampling_y, x_scale, y_scale, bd);
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ return warp_erroradv(wm, ref, width, height, stride, dst, p_col, p_row,
+ p_width, p_height, p_stride, subsampling_x,
+ subsampling_y, x_scale, y_scale);
+}
+
+void av1_warp_plane(WarpedMotionParams *wm,
+#if CONFIG_AOM_HIGHBITDEPTH
+ int use_hbd, int bd,
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ uint8_t *ref, int width, int height, int stride,
+ uint8_t *pred, int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, int x_scale, int y_scale, int ref_frm) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (use_hbd)
+ highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row,
+ p_width, p_height, p_stride, subsampling_x, subsampling_y,
+ x_scale, y_scale, bd, ref_frm);
+ else
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
+ p_height, p_stride, subsampling_x, subsampling_y, x_scale,
+ y_scale, ref_frm);
+}
+
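+// Converts a double-precision motion model to the fixed-point
+// WarpedMotionParams representation. The switch falls through intentionally
+// so that more general models also populate the lower-order parameters.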
+void av1_integerize_model(const double *model, TransformationType wmtype,
+ WarpedMotionParams *wm) {
+ wm->wmtype = wmtype;
+ switch (wmtype) {
+ case HOMOGRAPHY:
+ assert(fabs(model[8] - 1.0) < 1e-12);
+ wm->wmmat[6] =
+ (int32_t)lrint(model[6] * (1 << WARPEDMODEL_ROW3HOMO_PREC_BITS));
+ wm->wmmat[7] =
+ (int32_t)lrint(model[7] * (1 << WARPEDMODEL_ROW3HOMO_PREC_BITS));
+ /* fallthrough intended */
+ case AFFINE:
+ wm->wmmat[4] = (int32_t)lrint(model[4] * (1 << WARPEDMODEL_PREC_BITS));
+ wm->wmmat[5] = (int32_t)lrint(model[5] * (1 << WARPEDMODEL_PREC_BITS));
+ /* fallthrough intended */
+ case ROTZOOM:
+ wm->wmmat[2] = (int32_t)lrint(model[2] * (1 << WARPEDMODEL_PREC_BITS));
+ wm->wmmat[3] = (int32_t)lrint(model[3] * (1 << WARPEDMODEL_PREC_BITS));
+ /* fallthrough intended */
+ case TRANSLATION:
+ wm->wmmat[0] = (int32_t)lrint(model[0] * (1 << WARPEDMODEL_PREC_BITS));
+ wm->wmmat[1] = (int32_t)lrint(model[1] * (1 << WARPEDMODEL_PREC_BITS));
+ break;
+ default: assert(0 && "Invalid TransformationType");
+ }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// svdcmp
+// Adapted from Numerical Recipes in C
+
+static const double TINY_NEAR_ZERO = 1.0E-12;
+
+static INLINE double sign(double a, double b) {
+ return ((b) >= 0 ? fabs(a) : -fabs(a));
+}
+
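+// Computes sqrt(a * a + b * b) without destructive overflow or underflow,
+// in the manner of the classic hypot() routine.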
+static INLINE double pythag(double a, double b) {
+ double ct;
+ const double absa = fabs(a);
+ const double absb = fabs(b);
+
+ if (absa > absb) {
+ ct = absb / absa;
+ return absa * sqrt(1.0 + ct * ct);
+ } else {
+ ct = absa / absb;
+ return (absb == 0) ? 0 : absb * sqrt(1.0 + ct * ct);
+ }
+}
+
+static void multiply_mat(const double *m1, const double *m2, double *res,
+ const int m1_rows, const int inner_dim,
+ const int m2_cols) {
+ double sum;
+
+ int row, col, inner;
+ for (row = 0; row < m1_rows; ++row) {
+ for (col = 0; col < m2_cols; ++col) {
+ sum = 0;
+ for (inner = 0; inner < inner_dim; ++inner)
+ sum += m1[row * inner_dim + inner] * m2[inner * m2_cols + col];
+ *(res++) = sum;
+ }
+ }
+}
+
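+// In-place singular value decomposition of the m x n matrix u: on return u
+// holds the left singular vectors, w the singular values and v the right
+// singular vectors, so that the input matrix equals u * diag(w) * v^T.
+// Returns 0 on success, nonzero on failure.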
+static int svdcmp(double **u, int m, int n, double w[], double **v) {
+ const int max_its = 30;
+ int flag, i, its, j, jj, k, l, nm;
+ double anorm, c, f, g, h, s, scale, x, y, z;
+  double *rv1 = (double *)aom_malloc(sizeof(*rv1) * (n + 1));
+  if (!rv1) return 1;
+  g = scale = anorm = 0.0;
+ for (i = 0; i < n; i++) {
+ l = i + 1;
+ rv1[i] = scale * g;
+ g = s = scale = 0.0;
+ if (i < m) {
+ for (k = i; k < m; k++) scale += fabs(u[k][i]);
+ if (scale != 0.) {
+ for (k = i; k < m; k++) {
+ u[k][i] /= scale;
+ s += u[k][i] * u[k][i];
+ }
+ f = u[i][i];
+ g = -sign(sqrt(s), f);
+ h = f * g - s;
+ u[i][i] = f - g;
+ for (j = l; j < n; j++) {
+ for (s = 0.0, k = i; k < m; k++) s += u[k][i] * u[k][j];
+ f = s / h;
+ for (k = i; k < m; k++) u[k][j] += f * u[k][i];
+ }
+ for (k = i; k < m; k++) u[k][i] *= scale;
+ }
+ }
+ w[i] = scale * g;
+ g = s = scale = 0.0;
+ if (i < m && i != n - 1) {
+ for (k = l; k < n; k++) scale += fabs(u[i][k]);
+ if (scale != 0.) {
+ for (k = l; k < n; k++) {
+ u[i][k] /= scale;
+ s += u[i][k] * u[i][k];
+ }
+ f = u[i][l];
+ g = -sign(sqrt(s), f);
+ h = f * g - s;
+ u[i][l] = f - g;
+ for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
+ for (j = l; j < m; j++) {
+ for (s = 0.0, k = l; k < n; k++) s += u[j][k] * u[i][k];
+ for (k = l; k < n; k++) u[j][k] += s * rv1[k];
+ }
+ for (k = l; k < n; k++) u[i][k] *= scale;
+ }
+ }
+ anorm = fmax(anorm, (fabs(w[i]) + fabs(rv1[i])));
+ }
+
+ for (i = n - 1; i >= 0; i--) {
+ if (i < n - 1) {
+ if (g != 0.) {
+ for (j = l; j < n; j++) v[j][i] = (u[i][j] / u[i][l]) / g;
+ for (j = l; j < n; j++) {
+ for (s = 0.0, k = l; k < n; k++) s += u[i][k] * v[k][j];
+ for (k = l; k < n; k++) v[k][j] += s * v[k][i];
+ }
+ }
+ for (j = l; j < n; j++) v[i][j] = v[j][i] = 0.0;
+ }
+ v[i][i] = 1.0;
+ g = rv1[i];
+ l = i;
+ }
+ for (i = AOMMIN(m, n) - 1; i >= 0; i--) {
+ l = i + 1;
+ g = w[i];
+ for (j = l; j < n; j++) u[i][j] = 0.0;
+ if (g != 0.) {
+ g = 1.0 / g;
+ for (j = l; j < n; j++) {
+ for (s = 0.0, k = l; k < m; k++) s += u[k][i] * u[k][j];
+ f = (s / u[i][i]) * g;
+ for (k = i; k < m; k++) u[k][j] += f * u[k][i];
+ }
+ for (j = i; j < m; j++) u[j][i] *= g;
+ } else {
+ for (j = i; j < m; j++) u[j][i] = 0.0;
+ }
+ ++u[i][i];
+ }
+ for (k = n - 1; k >= 0; k--) {
+ for (its = 0; its < max_its; its++) {
+ flag = 1;
+ for (l = k; l >= 0; l--) {
+ nm = l - 1;
+ if ((double)(fabs(rv1[l]) + anorm) == anorm || nm < 0) {
+ flag = 0;
+ break;
+ }
+ if ((double)(fabs(w[nm]) + anorm) == anorm) break;
+ }
+ if (flag) {
+ c = 0.0;
+ s = 1.0;
+ for (i = l; i <= k; i++) {
+ f = s * rv1[i];
+ rv1[i] = c * rv1[i];
+ if ((double)(fabs(f) + anorm) == anorm) break;
+ g = w[i];
+ h = pythag(f, g);
+ w[i] = h;
+ h = 1.0 / h;
+ c = g * h;
+ s = -f * h;
+ for (j = 0; j < m; j++) {
+ y = u[j][nm];
+ z = u[j][i];
+ u[j][nm] = y * c + z * s;
+ u[j][i] = z * c - y * s;
+ }
+ }
+ }
+ z = w[k];
+ if (l == k) {
+ if (z < 0.0) {
+ w[k] = -z;
+ for (j = 0; j < n; j++) v[j][k] = -v[j][k];
+ }
+ break;
+ }
+      if (its == max_its - 1) {
+        aom_free(rv1);
+        return 1;
+      }
+ assert(k > 0);
+ x = w[l];
+ nm = k - 1;
+ y = w[nm];
+ g = rv1[nm];
+ h = rv1[k];
+ f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
+ g = pythag(f, 1.0);
+ f = ((x - z) * (x + z) + h * ((y / (f + sign(g, f))) - h)) / x;
+ c = s = 1.0;
+ for (j = l; j <= nm; j++) {
+ i = j + 1;
+ g = rv1[i];
+ y = w[i];
+ h = s * g;
+ g = c * g;
+ z = pythag(f, h);
+ rv1[j] = z;
+ c = f / z;
+ s = h / z;
+ f = x * c + g * s;
+ g = g * c - x * s;
+ h = y * s;
+ y *= c;
+ for (jj = 0; jj < n; jj++) {
+ x = v[jj][j];
+ z = v[jj][i];
+ v[jj][j] = x * c + z * s;
+ v[jj][i] = z * c - x * s;
+ }
+ z = pythag(f, h);
+ w[j] = z;
+ if (z != 0.) {
+ z = 1.0 / z;
+ c = f * z;
+ s = h * z;
+ }
+ f = c * g + s * y;
+ x = c * y - s * g;
+ for (jj = 0; jj < m; jj++) {
+ y = u[jj][j];
+ z = u[jj][i];
+ u[jj][j] = y * c + z * s;
+ u[jj][i] = z * c - y * s;
+ }
+ }
+ rv1[l] = 0.0;
+ rv1[k] = f;
+ w[k] = x;
+ }
+ }
+ aom_free(rv1);
+ return 0;
+}
+
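+// Runs svdcmp() on the flat, row-major M x N matrix matx; the caller provides
+// U (M x N) for the left singular vectors, W (length N) for the singular
+// values and V (N x N) for the right singular vectors. Returns 0 on success.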
+static int SVD(double *U, double *W, double *V, double *matx, int M, int N) {
+  // Assumes U is allocated as an M x N matrix and V as N x N (row-major)
+ double **nrU = (double **)aom_malloc((M) * sizeof(*nrU));
+ double **nrV = (double **)aom_malloc((N) * sizeof(*nrV));
+ int problem, i;
+
+ problem = !(nrU && nrV);
+ if (!problem) {
+ for (i = 0; i < M; i++) {
+ nrU[i] = &U[i * N];
+ }
+ for (i = 0; i < N; i++) {
+ nrV[i] = &V[i * N];
+ }
+ } else {
+ if (nrU) aom_free(nrU);
+ if (nrV) aom_free(nrV);
+ return 1;
+ }
+
+ /* copy from given matx into nrU */
+ for (i = 0; i < M; i++) {
+ memcpy(&(nrU[i][0]), matx + N * i, N * sizeof(*matx));
+ }
+
+  /* Perform the SVD on the copied matrix */
+ if (svdcmp(nrU, M, N, W, nrV)) {
+ aom_free(nrU);
+ aom_free(nrV);
+ return 1;
+ }
+
+  /* Free the Numerical-Recipes-style row-pointer arrays */
+ aom_free(nrU);
+ aom_free(nrV);
+
+ return 0;
+}
+
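+// Computes the Moore-Penrose pseudo-inverse of the M x N matrix matx via the
+// SVD, inv = V * diag(1/w) * U^T, stored row-major as an N x M matrix.
+// Returns 0 on success, nonzero on failure or if any singular value is near
+// zero.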
+int pseudo_inverse(double *inv, double *matx, const int M, const int N) {
+ double ans;
+ int i, j, k;
+ double *const U = (double *)aom_malloc(M * N * sizeof(*matx));
+ double *const W = (double *)aom_malloc(N * sizeof(*matx));
+ double *const V = (double *)aom_malloc(N * N * sizeof(*matx));
+
+  if (!(U && W && V)) {
+    aom_free(U);
+    aom_free(W);
+    aom_free(V);
+    return 1;
+  }
+  if (SVD(U, W, V, matx, M, N)) {
+    aom_free(U);
+    aom_free(W);
+    aom_free(V);
+    return 1;
+  }
+  for (i = 0; i < N; i++) {
+    if (fabs(W[i]) < TINY_NEAR_ZERO) {
+      aom_free(U);
+      aom_free(W);
+      aom_free(V);
+      return 1;
+    }
+  }
+
+ for (i = 0; i < N; i++) {
+ for (j = 0; j < M; j++) {
+ ans = 0;
+ for (k = 0; k < N; k++) {
+ ans += V[k + N * i] * U[k + N * j] / W[k];
+ }
+ inv[j + M * i] = ans;
+ }
+ }
+ aom_free(U);
+ aom_free(W);
+ aom_free(V);
+ return 0;
+}
+
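+// Normalizes the n 2-D points in place (Hartley-style): translate so the
+// centroid is at the origin and scale so the mean distance from the origin is
+// sqrt(2). T receives the 3x3 transform that was applied.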
+static void normalize_homography(double *pts, int n, double *T) {
+ // Assume the points are 2d coordinates with scale = 1
+ double *p = pts;
+ double mean[2] = { 0, 0 };
+ double msqe = 0;
+ double scale;
+ int i;
+ for (i = 0; i < n; ++i, p += 2) {
+ mean[0] += p[0];
+ mean[1] += p[1];
+ }
+ mean[0] /= n;
+ mean[1] /= n;
+ for (p = pts, i = 0; i < n; ++i, p += 2) {
+ p[0] -= mean[0];
+ p[1] -= mean[1];
+ msqe += sqrt(p[0] * p[0] + p[1] * p[1]);
+ }
+ msqe /= n;
+ scale = sqrt(2) / msqe;
+ T[0] = scale;
+ T[1] = 0;
+ T[2] = -scale * mean[0];
+ T[3] = 0;
+ T[4] = scale;
+ T[5] = -scale * mean[1];
+ T[6] = 0;
+ T[7] = 0;
+ T[8] = 1;
+ for (p = pts, i = 0; i < n; ++i, p += 2) {
+ p[0] *= scale;
+ p[1] *= scale;
+ }
+}
+
+static void invnormalize_mat(double *T, double *iT) {
+ double is = 1.0 / T[0];
+ double m0 = -T[2] * is;
+ double m1 = -T[5] * is;
+ iT[0] = is;
+ iT[1] = 0;
+ iT[2] = m0;
+ iT[3] = 0;
+ iT[4] = is;
+ iT[5] = m1;
+ iT[6] = 0;
+ iT[7] = 0;
+ iT[8] = 1;
+}
+
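+// Maps the estimated parameters back to the original (unnormalized)
+// coordinate frames: params <- inv(T2) * params * T1.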
+static void denormalize_homography(double *params, double *T1, double *T2) {
+ double iT2[9];
+ double params2[9];
+ invnormalize_mat(T2, iT2);
+ multiply_mat(params, T1, params2, 3, 3, 3);
+ multiply_mat(iT2, params2, params, 3, 3, 3);
+}
+
+static void denormalize_affine(double *params, double *T1, double *T2) {
+ double params_denorm[MAX_PARAMDIM];
+ params_denorm[0] = params[0];
+ params_denorm[1] = params[1];
+ params_denorm[2] = params[4];
+ params_denorm[3] = params[2];
+ params_denorm[4] = params[3];
+ params_denorm[5] = params[5];
+ params_denorm[6] = params_denorm[7] = 0;
+ params_denorm[8] = 1;
+ denormalize_homography(params_denorm, T1, T2);
+ params[0] = params_denorm[5];
+ params[1] = params_denorm[2];
+ params[2] = params_denorm[1];
+ params[3] = params_denorm[0];
+ params[4] = params_denorm[3];
+ params[5] = params_denorm[4];
+}
+
+static void denormalize_rotzoom(double *params, double *T1, double *T2) {
+ double params_denorm[MAX_PARAMDIM];
+ params_denorm[0] = params[0];
+ params_denorm[1] = params[1];
+ params_denorm[2] = params[2];
+ params_denorm[3] = -params[1];
+ params_denorm[4] = params[0];
+ params_denorm[5] = params[3];
+ params_denorm[6] = params_denorm[7] = 0;
+ params_denorm[8] = 1;
+ denormalize_homography(params_denorm, T1, T2);
+ params[0] = params_denorm[5];
+ params[1] = params_denorm[2];
+ params[2] = params_denorm[1];
+ params[3] = params_denorm[0];
+}
+
+static void denormalize_translation(double *params, double *T1, double *T2) {
+ double params_denorm[MAX_PARAMDIM];
+ params_denorm[0] = 1;
+ params_denorm[1] = 0;
+ params_denorm[2] = params[0];
+ params_denorm[3] = 0;
+ params_denorm[4] = 1;
+ params_denorm[5] = params[1];
+ params_denorm[6] = params_denorm[7] = 0;
+ params_denorm[8] = 1;
+ denormalize_homography(params_denorm, T1, T2);
+ params[0] = params_denorm[5];
+ params[1] = params_denorm[2];
+}
+
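+// Fits a translation model to the correspondences pts1 -> pts2; the result is
+// simply the mean displacement, computed in the normalized frame and then
+// denormalized.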
+int find_translation(const int np, double *pts1, double *pts2, double *mat) {
+ int i;
+ double sx, sy, dx, dy;
+ double sumx, sumy;
+
+ double T1[9], T2[9];
+ normalize_homography(pts1, np, T1);
+ normalize_homography(pts2, np, T2);
+
+ sumx = 0;
+ sumy = 0;
+ for (i = 0; i < np; ++i) {
+ dx = *(pts2++);
+ dy = *(pts2++);
+ sx = *(pts1++);
+ sy = *(pts1++);
+
+ sumx += dx - sx;
+ sumy += dy - sy;
+ }
+ mat[0] = sumx / np;
+ mat[1] = sumy / np;
+ denormalize_translation(mat, T1, T2);
+ return 0;
+}
+
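+// Least-squares fit of a 4-parameter rotation/zoom (similarity) model: each
+// correspondence contributes two rows to an overdetermined system A x = b,
+// which is solved with the pseudo-inverse. Returns 0 on success.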
+int find_rotzoom(const int np, double *pts1, double *pts2, double *mat) {
+ const int np2 = np * 2;
+ double *a = (double *)aom_malloc(sizeof(*a) * np2 * 9);
+ double *b = a + np2 * 4;
+ double *temp = b + np2;
+ int i;
+ double sx, sy, dx, dy;
+
+ double T1[9], T2[9];
+ normalize_homography(pts1, np, T1);
+ normalize_homography(pts2, np, T2);
+
+ for (i = 0; i < np; ++i) {
+ dx = *(pts2++);
+ dy = *(pts2++);
+ sx = *(pts1++);
+ sy = *(pts1++);
+
+ a[i * 2 * 4 + 0] = sx;
+ a[i * 2 * 4 + 1] = sy;
+ a[i * 2 * 4 + 2] = 1;
+ a[i * 2 * 4 + 3] = 0;
+ a[(i * 2 + 1) * 4 + 0] = sy;
+ a[(i * 2 + 1) * 4 + 1] = -sx;
+ a[(i * 2 + 1) * 4 + 2] = 0;
+ a[(i * 2 + 1) * 4 + 3] = 1;
+
+ b[2 * i] = dx;
+ b[2 * i + 1] = dy;
+ }
+ if (pseudo_inverse(temp, a, np2, 4)) {
+ aom_free(a);
+ return 1;
+ }
+ multiply_mat(temp, b, mat, 4, np2, 1);
+ denormalize_rotzoom(mat, T1, T2);
+ aom_free(a);
+ return 0;
+}
+
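+// Least-squares fit of a 6-parameter affine model, set up and solved in the
+// same way as find_rotzoom(). Returns 0 on success.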
+int find_affine(const int np, double *pts1, double *pts2, double *mat) {
+ const int np2 = np * 2;
+ double *a = (double *)aom_malloc(sizeof(*a) * np2 * 13);
+ double *b = a + np2 * 6;
+ double *temp = b + np2;
+ int i;
+ double sx, sy, dx, dy;
+
+ double T1[9], T2[9];
+ normalize_homography(pts1, np, T1);
+ normalize_homography(pts2, np, T2);
+
+ for (i = 0; i < np; ++i) {
+ dx = *(pts2++);
+ dy = *(pts2++);
+ sx = *(pts1++);
+ sy = *(pts1++);
+
+ a[i * 2 * 6 + 0] = sx;
+ a[i * 2 * 6 + 1] = sy;
+ a[i * 2 * 6 + 2] = 0;
+ a[i * 2 * 6 + 3] = 0;
+ a[i * 2 * 6 + 4] = 1;
+ a[i * 2 * 6 + 5] = 0;
+ a[(i * 2 + 1) * 6 + 0] = 0;
+ a[(i * 2 + 1) * 6 + 1] = 0;
+ a[(i * 2 + 1) * 6 + 2] = sx;
+ a[(i * 2 + 1) * 6 + 3] = sy;
+ a[(i * 2 + 1) * 6 + 4] = 0;
+ a[(i * 2 + 1) * 6 + 5] = 1;
+
+ b[2 * i] = dx;
+ b[2 * i + 1] = dy;
+ }
+ if (pseudo_inverse(temp, a, np2, 6)) {
+ aom_free(a);
+ return 1;
+ }
+ multiply_mat(temp, b, mat, 6, np2, 1);
+ denormalize_affine(mat, T1, T2);
+ aom_free(a);
+ return 0;
+}
+
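+// Direct linear transform (DLT) fit of a homography: three equations are
+// formed per correspondence and the solution is the right singular vector
+// associated with the smallest singular value.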
+int find_homography(const int np, double *pts1, double *pts2, double *mat) {
+  // Adapted from Peter Kovesi's normalized homography implementation
+ const int np3 = np * 3;
+ double *a = (double *)aom_malloc(sizeof(*a) * np3 * 18);
+ double *U = a + np3 * 9;
+ double S[9], V[9 * 9];
+ int i, mini;
+ double sx, sy, dx, dy;
+ double T1[9], T2[9];
+
+ normalize_homography(pts1, np, T1);
+ normalize_homography(pts2, np, T2);
+
+ for (i = 0; i < np; ++i) {
+ dx = *(pts2++);
+ dy = *(pts2++);
+ sx = *(pts1++);
+ sy = *(pts1++);
+
+ a[i * 3 * 9 + 0] = a[i * 3 * 9 + 1] = a[i * 3 * 9 + 2] = 0;
+ a[i * 3 * 9 + 3] = -sx;
+ a[i * 3 * 9 + 4] = -sy;
+ a[i * 3 * 9 + 5] = -1;
+ a[i * 3 * 9 + 6] = dy * sx;
+ a[i * 3 * 9 + 7] = dy * sy;
+ a[i * 3 * 9 + 8] = dy;
+
+ a[(i * 3 + 1) * 9 + 0] = sx;
+ a[(i * 3 + 1) * 9 + 1] = sy;
+ a[(i * 3 + 1) * 9 + 2] = 1;
+ a[(i * 3 + 1) * 9 + 3] = a[(i * 3 + 1) * 9 + 4] = a[(i * 3 + 1) * 9 + 5] =
+ 0;
+ a[(i * 3 + 1) * 9 + 6] = -dx * sx;
+ a[(i * 3 + 1) * 9 + 7] = -dx * sy;
+ a[(i * 3 + 1) * 9 + 8] = -dx;
+
+ a[(i * 3 + 2) * 9 + 0] = -dy * sx;
+ a[(i * 3 + 2) * 9 + 1] = -dy * sy;
+ a[(i * 3 + 2) * 9 + 2] = -dy;
+ a[(i * 3 + 2) * 9 + 3] = dx * sx;
+ a[(i * 3 + 2) * 9 + 4] = dx * sy;
+ a[(i * 3 + 2) * 9 + 5] = dx;
+ a[(i * 3 + 2) * 9 + 6] = a[(i * 3 + 2) * 9 + 7] = a[(i * 3 + 2) * 9 + 8] =
+ 0;
+ }
+
+ if (SVD(U, S, V, a, np3, 9)) {
+ aom_free(a);
+ return 1;
+ } else {
+ double minS = 1e12;
+ mini = -1;
+ for (i = 0; i < 9; ++i) {
+ if (S[i] < minS) {
+ minS = S[i];
+ mini = i;
+ }
+ }
+ }
+
+ for (i = 0; i < 9; i++) mat[i] = V[i * 9 + mini];
+ denormalize_homography(mat, T1, T2);
+ aom_free(a);
+ if (mat[8] == 0.0) {
+ return 1;
+ }
+ return 0;
+}
diff --git a/av1/common/warped_motion.h b/av1/common/warped_motion.h
new file mode 100644
index 0000000..7c9919f
--- /dev/null
+++ b/av1/common/warped_motion.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be
+ * found in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AV1_COMMON_WARPED_MOTION_H_
+#define AV1_COMMON_WARPED_MOTION_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "./aom_config.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/mv.h"
+
+#define MAX_PARAMDIM 9
+
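+// Projects an array of points through an integerized warp model; the
+// project_points_* variants below implement this for each motion model.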
+typedef void (*ProjectPointsFunc)(int32_t *mat, int *points, int *proj,
+ const int n, const int stride_points,
+ const int stride_proj,
+ const int subsampling_x,
+ const int subsampling_y);
+
+void project_points_translation(int32_t *mat, int *points, int *proj,
+ const int n, const int stride_points,
+ const int stride_proj, const int subsampling_x,
+ const int subsampling_y);
+
+void project_points_rotzoom(int32_t *mat, int *points, int *proj, const int n,
+ const int stride_points, const int stride_proj,
+ const int subsampling_x, const int subsampling_y);
+
+void project_points_affine(int32_t *mat, int *points, int *proj, const int n,
+ const int stride_points, const int stride_proj,
+ const int subsampling_x, const int subsampling_y);
+
+void project_points_homography(int32_t *mat, int *points, int *proj,
+ const int n, const int stride_points,
+ const int stride_proj, const int subsampling_x,
+ const int subsampling_y);
+
+double av1_warp_erroradv(WarpedMotionParams *wm,
+#if CONFIG_AOM_HIGHBITDEPTH
+ int use_hbd, int bd,
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ uint8_t *ref, int width, int height, int stride,
+ uint8_t *dst, int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, int x_scale, int y_scale);
+
+void av1_warp_plane(WarpedMotionParams *wm,
+#if CONFIG_AOM_HIGHBITDEPTH
+ int use_hbd, int bd,
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ uint8_t *ref, int width, int height, int stride,
+ uint8_t *pred, int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, int x_scale, int y_scale, int ref_frm);
+
+// Integerize model into the WarpedMotionParams structure
+void av1_integerize_model(const double *model, TransformationType wmtype,
+ WarpedMotionParams *wm);
+
+int find_translation(const int np, double *pts1, double *pts2, double *mat);
+int find_rotzoom(const int np, double *pts1, double *pts2, double *mat);
+int find_affine(const int np, double *pts1, double *pts2, double *mat);
+int find_homography(const int np, double *pts1, double *pts2, double *mat);
+#endif // AV1_COMMON_WARPED_MOTION_H_
diff --git a/av1/common/x86/av1_convolve_filters_ssse3.h b/av1/common/x86/av1_convolve_filters_ssse3.h
index ad92e01..b617831 100644
--- a/av1/common/x86/av1_convolve_filters_ssse3.h
+++ b/av1/common/x86/av1_convolve_filters_ssse3.h
@@ -8,134 +8,74 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+
#ifndef AV1_COMMON_X86_AV1_CONVOLVE_FILTERS_SSSE3_H_
#define AV1_COMMON_X86_AV1_CONVOLVE_FILTERS_SSSE3_H_
#include "./aom_config.h"
-#include "av1/common/filter.h"
#if CONFIG_EXT_INTERP
DECLARE_ALIGNED(16, static const int8_t,
sub_pel_filters_10sharp_signal_dir[15][2][16]) = {
{
- {
- 0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0,
- },
+ { 0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0, 0, 0 },
},
{
- {
- 0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0,
- },
+ { 0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 1, -2, 5, -12, 124, 18, -7, 3, -2, 0, 0, 0, 0 },
},
{
- {
- 0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0,
- },
+ { 0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 1, -3, 7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0 },
},
{
- {
- 0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0,
- },
+ { 0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 1, -4, 8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0 },
},
{
- {
- 0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0,
- },
+ { 0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 1, -4, 9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0 },
},
{
- {
- 0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0,
- },
+ { 0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 2, -5, 10, -24, 99, 59, -20, 9, -4, 2, 0, 0, 0 },
},
{
- {
- 0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0,
- },
+ { 0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 2, -5, 10, -24, 90, 70, -22, 10, -5, 2, 0, 0, 0 },
},
{
- {
- 0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0,
- },
+ { 0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 2, -5, 10, -23, 80, 80, -23, 10, -5, 2, 0, 0, 0 },
},
{
- {
- 0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0,
- },
+ { 0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 2, -5, 10, -22, 70, 90, -24, 10, -5, 2, 0, 0, 0 },
},
{
- {
- 0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0,
- },
+ { 0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 2, -4, 9, -20, 59, 99, -24, 10, -5, 2, 0, 0, 0 },
},
{
- {
- 0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0,
- },
+ { 0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 1, -4, 8, -17, 49, 107, -22, 9, -4, 1, 0, 0, 0 },
},
{
- {
- 0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0,
- },
+ { 0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 1, -3, 7, -14, 38, 114, -20, 8, -4, 1, 0, 0, 0 },
},
{
- {
- 0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0,
- },
+ { 0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 1, -2, 5, -11, 28, 119, -17, 7, -3, 1, 0, 0, 0 },
},
{
- {
- 0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0,
- },
+ { 0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, -2, 3, -7, 18, 124, -12, 5, -2, 1, 0, 0, 0 },
},
{
- {
- 0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0,
- },
+ { 0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, -1, 2, -4, 8, 127, -6, 3, -1, 0, 0, 0, 0 },
},
};
#endif
@@ -143,330 +83,150 @@
DECLARE_ALIGNED(16, static const int8_t,
sub_pel_filters_10sharp_ver_signal_dir[15][6][16]) = {
{
- {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- },
- {
- -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3,
- },
- {
- -6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6,
- 127,
- },
- {
- 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4,
- },
- {
- 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1,
- },
- {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6,
+ 127 },
+ { 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4 },
+ { 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
},
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5,
- },
- {
- -12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124,
- -12, 124,
- },
- {
- 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7,
- },
- {
- 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2,
- },
- {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5 },
+ { -12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124,
+ -12, 124 },
+ { 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7 },
+ { 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
},
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7,
- },
- {
- -17, 119, -17, 119, -17, 119, -17, 119, -17, 119, -17, 119, -17, 119,
- -17, 119,
- },
- {
- 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28,
- -11,
- },
- {
- 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7 },
+ { -17, 119, -17, 119, -17, 119, -17, 119, -17, 119, -17, 119, -17, 119,
+ -17, 119 },
+ { 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28,
+ -11 },
+ { 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2 },
+ { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 },
},
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8,
- },
- {
- -20, 114, -20, 114, -20, 114, -20, 114, -20, 114, -20, 114, -20, 114,
- -20, 114,
- },
- {
- 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38,
- -14,
- },
- {
- 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8 },
+ { -20, 114, -20, 114, -20, 114, -20, 114, -20, 114, -20, 114, -20, 114,
+ -20, 114 },
+ { 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38,
+ -14 },
+ { 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3 },
+ { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 },
},
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9,
- },
- {
- -22, 107, -22, 107, -22, 107, -22, 107, -22, 107, -22, 107, -22, 107,
- -22, 107,
- },
- {
- 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49,
- -17,
- },
- {
- 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9 },
+ { -22, 107, -22, 107, -22, 107, -22, 107, -22, 107, -22, 107, -22, 107,
+ -22, 107 },
+ { 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49,
+ -17 },
+ { 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4 },
+ { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 },
},
{
- {
- 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
- },
- {
- -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10,
- },
- {
- -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24,
- 99,
- },
- {
- 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59,
- -20,
- },
- {
- 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4,
- },
- {
- 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0,
- },
+ { 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2 },
+ { -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10 },
+ { -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24,
+ 99 },
+ { 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59,
+ -20 },
+ { 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4 },
+ { 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0 },
},
{
- {
- 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
- },
- {
- -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10,
- },
- {
- -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24,
- 90,
- },
- {
- 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70,
- -22,
- },
- {
- 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5,
- },
- {
- 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0,
- },
+ { 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2 },
+ { -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10 },
+ { -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24,
+ 90 },
+ { 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70,
+ -22 },
+ { 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5 },
+ { 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0 },
},
{
- {
- 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
- },
- {
- -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10,
- },
- {
- -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23,
- 80,
- },
- {
- 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80,
- -23,
- },
- {
- 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5,
- },
- {
- 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0,
- },
+ { 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2 },
+ { -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10 },
+ { -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23,
+ 80 },
+ { 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80,
+ -23 },
+ { 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5 },
+ { 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0 },
},
{
- {
- 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
- },
- {
- -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10,
- },
- {
- -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22,
- 70,
- },
- {
- 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90,
- -24,
- },
- {
- 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5,
- },
- {
- 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0,
- },
+ { 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2 },
+ { -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10 },
+ { -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22,
+ 70 },
+ { 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90,
+ -24 },
+ { 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5 },
+ { 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0 },
},
{
- {
- 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
- },
- {
- -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9,
- },
- {
- -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20,
- 59,
- },
- {
- 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99,
- -24,
- },
- {
- 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5,
- },
- {
- 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0,
- },
+ { 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2 },
+ { -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9 },
+ { -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20,
+ 59 },
+ { 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99,
+ -24 },
+ { 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5 },
+ { 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0 },
},
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8,
- },
- {
- -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17,
- 49,
- },
- {
- 107, -22, 107, -22, 107, -22, 107, -22, 107, -22, 107, -22, 107, -22,
- 107, -22,
- },
- {
- 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8 },
+ { -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17,
+ 49 },
+ { 107, -22, 107, -22, 107, -22, 107, -22, 107, -22, 107, -22, 107, -22,
+ 107, -22 },
+ { 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4 },
+ { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 },
},
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7,
- },
- {
- -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14,
- 38,
- },
- {
- 114, -20, 114, -20, 114, -20, 114, -20, 114, -20, 114, -20, 114, -20,
- 114, -20,
- },
- {
- 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7 },
+ { -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14,
+ 38 },
+ { 114, -20, 114, -20, 114, -20, 114, -20, 114, -20, 114, -20, 114, -20,
+ 114, -20 },
+ { 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4 },
+ { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 },
},
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5,
- },
- {
- -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11,
- 28,
- },
- {
- 119, -17, 119, -17, 119, -17, 119, -17, 119, -17, 119, -17, 119, -17,
- 119, -17,
- },
- {
- 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5 },
+ { -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11,
+ 28 },
+ { 119, -17, 119, -17, 119, -17, 119, -17, 119, -17, 119, -17, 119, -17,
+ 119, -17 },
+ { 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3 },
+ { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 },
},
{
- {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- },
- {
- -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3,
- },
- {
- -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18,
- },
- {
- 124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12,
- 124, -12,
- },
- {
- 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3 },
+ { -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18 },
+ { 124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12,
+ 124, -12 },
+ { 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2 },
+ { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 },
},
{
- {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- },
- {
- -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2,
- },
- {
- -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8,
- },
- {
- 127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127,
- -6,
- },
- {
- 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1,
- },
- {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8 },
+ { 127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127,
+ -6 },
+ { 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
},
};
#endif
@@ -474,124 +234,64 @@
DECLARE_ALIGNED(16, static const int8_t,
sub_pel_filters_12sharp_signal_dir[15][2][16]) = {
{
- {
- 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0, 0, 0,
- },
+ { 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 1, -2, 3, -7, 127, 8, -4, 2, -1, 1, 0, 0, 0 },
},
{
- {
- -1, 2, -3, 6, -13, 124, 18, -8, 4, -2, 2, -1, 0, 0, 0, 0,
- },
- {
- 0, 0, -1, 2, -3, 6, -13, 124, 18, -8, 4, -2, 2, -1, 0, 0,
- },
+ { -1, 2, -3, 6, -13, 124, 18, -8, 4, -2, 2, -1, 0, 0, 0, 0 },
+ { 0, 0, -1, 2, -3, 6, -13, 124, 18, -8, 4, -2, 2, -1, 0, 0 },
},
{
- {
- -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1, 0, 0, 0, 0,
- },
- {
- 0, 0, -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1, 0, 0,
- },
+ { -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1, 0, 0, 0, 0 },
+ { 0, 0, -1, 3, -4, 8, -18, 120, 28, -12, 7, -4, 2, -1, 0, 0 },
},
{
- {
- -1, 3, -6, 10, -21, 115, 38, -15, 8, -5, 3, -1, 0, 0, 0, 0,
- },
- {
- 0, 0, -1, 3, -6, 10, -21, 115, 38, -15, 8, -5, 3, -1, 0, 0,
- },
+ { -1, 3, -6, 10, -21, 115, 38, -15, 8, -5, 3, -1, 0, 0, 0, 0 },
+ { 0, 0, -1, 3, -6, 10, -21, 115, 38, -15, 8, -5, 3, -1, 0, 0 },
},
{
- {
- -2, 4, -6, 12, -24, 108, 49, -18, 10, -6, 3, -2, 0, 0, 0, 0,
- },
- {
- 0, 0, -2, 4, -6, 12, -24, 108, 49, -18, 10, -6, 3, -2, 0, 0,
- },
+ { -2, 4, -6, 12, -24, 108, 49, -18, 10, -6, 3, -2, 0, 0, 0, 0 },
+ { 0, 0, -2, 4, -6, 12, -24, 108, 49, -18, 10, -6, 3, -2, 0, 0 },
},
{
- {
- -2, 4, -7, 13, -25, 100, 60, -21, 11, -7, 4, -2, 0, 0, 0, 0,
- },
- {
- 0, 0, -2, 4, -7, 13, -25, 100, 60, -21, 11, -7, 4, -2, 0, 0,
- },
+ { -2, 4, -7, 13, -25, 100, 60, -21, 11, -7, 4, -2, 0, 0, 0, 0 },
+ { 0, 0, -2, 4, -7, 13, -25, 100, 60, -21, 11, -7, 4, -2, 0, 0 },
},
{
- {
- -2, 4, -7, 13, -26, 91, 71, -24, 13, -7, 4, -2, 0, 0, 0, 0,
- },
- {
- 0, 0, -2, 4, -7, 13, -26, 91, 71, -24, 13, -7, 4, -2, 0, 0,
- },
+ { -2, 4, -7, 13, -26, 91, 71, -24, 13, -7, 4, -2, 0, 0, 0, 0 },
+ { 0, 0, -2, 4, -7, 13, -26, 91, 71, -24, 13, -7, 4, -2, 0, 0 },
},
{
- {
- -2, 4, -7, 13, -25, 81, 81, -25, 13, -7, 4, -2, 0, 0, 0, 0,
- },
- {
- 0, 0, -2, 4, -7, 13, -25, 81, 81, -25, 13, -7, 4, -2, 0, 0,
- },
+ { -2, 4, -7, 13, -25, 81, 81, -25, 13, -7, 4, -2, 0, 0, 0, 0 },
+ { 0, 0, -2, 4, -7, 13, -25, 81, 81, -25, 13, -7, 4, -2, 0, 0 },
},
{
- {
- -2, 4, -7, 13, -24, 71, 91, -26, 13, -7, 4, -2, 0, 0, 0, 0,
- },
- {
- 0, 0, -2, 4, -7, 13, -24, 71, 91, -26, 13, -7, 4, -2, 0, 0,
- },
+ { -2, 4, -7, 13, -24, 71, 91, -26, 13, -7, 4, -2, 0, 0, 0, 0 },
+ { 0, 0, -2, 4, -7, 13, -24, 71, 91, -26, 13, -7, 4, -2, 0, 0 },
},
{
- {
- -2, 4, -7, 11, -21, 60, 100, -25, 13, -7, 4, -2, 0, 0, 0, 0,
- },
- {
- 0, 0, -2, 4, -7, 11, -21, 60, 100, -25, 13, -7, 4, -2, 0, 0,
- },
+ { -2, 4, -7, 11, -21, 60, 100, -25, 13, -7, 4, -2, 0, 0, 0, 0 },
+ { 0, 0, -2, 4, -7, 11, -21, 60, 100, -25, 13, -7, 4, -2, 0, 0 },
},
{
- {
- -2, 3, -6, 10, -18, 49, 108, -24, 12, -6, 4, -2, 0, 0, 0, 0,
- },
- {
- 0, 0, -2, 3, -6, 10, -18, 49, 108, -24, 12, -6, 4, -2, 0, 0,
- },
+ { -2, 3, -6, 10, -18, 49, 108, -24, 12, -6, 4, -2, 0, 0, 0, 0 },
+ { 0, 0, -2, 3, -6, 10, -18, 49, 108, -24, 12, -6, 4, -2, 0, 0 },
},
{
- {
- -1, 3, -5, 8, -15, 38, 115, -21, 10, -6, 3, -1, 0, 0, 0, 0,
- },
- {
- 0, 0, -1, 3, -5, 8, -15, 38, 115, -21, 10, -6, 3, -1, 0, 0,
- },
+ { -1, 3, -5, 8, -15, 38, 115, -21, 10, -6, 3, -1, 0, 0, 0, 0 },
+ { 0, 0, -1, 3, -5, 8, -15, 38, 115, -21, 10, -6, 3, -1, 0, 0 },
},
{
- {
- -1, 2, -4, 7, -12, 28, 120, -18, 8, -4, 3, -1, 0, 0, 0, 0,
- },
- {
- 0, 0, -1, 2, -4, 7, -12, 28, 120, -18, 8, -4, 3, -1, 0, 0,
- },
+ { -1, 2, -4, 7, -12, 28, 120, -18, 8, -4, 3, -1, 0, 0, 0, 0 },
+ { 0, 0, -1, 2, -4, 7, -12, 28, 120, -18, 8, -4, 3, -1, 0, 0 },
},
{
- {
- -1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1, 0, 0, 0, 0,
- },
- {
- 0, 0, -1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1, 0, 0,
- },
+ { -1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1, 0, 0, 0, 0 },
+ { 0, 0, -1, 2, -2, 4, -8, 18, 124, -13, 6, -3, 2, -1, 0, 0 },
},
{
- {
- 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0, 0, 0,
- },
+ { 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 1, -1, 2, -4, 8, 127, -7, 3, -2, 1, 0, 0, 0 },
},
};
#endif
@@ -599,330 +299,366 @@
DECLARE_ALIGNED(16, static const int8_t,
sub_pel_filters_12sharp_ver_signal_dir[15][6][16]) = {
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3,
- },
- {
- -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7,
- 127,
- },
- {
- 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4,
- },
- {
- 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3 },
+ { -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7,
+ 127 },
+ { 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4 },
+ { 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1 },
+ { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 },
},
{
- {
- -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2,
- },
- {
- -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6,
- },
- {
- -13, 124, -13, 124, -13, 124, -13, 124, -13, 124, -13, 124, -13, 124,
- -13, 124,
- },
- {
- 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8,
- },
- {
- 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2,
- },
- {
- 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1,
- },
+ { -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6 },
+ { -13, 124, -13, 124, -13, 124, -13, 124, -13, 124, -13, 124, -13, 124,
+ -13, 124 },
+ { 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8 },
+ { 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2 },
+ { 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1 },
},
{
- {
- -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3,
- },
- {
- -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8,
- },
- {
- -18, 120, -18, 120, -18, 120, -18, 120, -18, 120, -18, 120, -18, 120,
- -18, 120,
- },
- {
- 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28,
- -12,
- },
- {
- 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4,
- },
- {
- 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1,
- },
+ { -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8 },
+ { -18, 120, -18, 120, -18, 120, -18, 120, -18, 120, -18, 120, -18, 120,
+ -18, 120 },
+ { 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28,
+ -12 },
+ { 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4 },
+ { 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1 },
},
{
- {
- -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3,
- },
- {
- -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10,
- },
- {
- -21, 115, -21, 115, -21, 115, -21, 115, -21, 115, -21, 115, -21, 115,
- -21, 115,
- },
- {
- 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38,
- -15,
- },
- {
- 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5,
- },
- {
- 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1,
- },
+ { -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10 },
+ { -21, 115, -21, 115, -21, 115, -21, 115, -21, 115, -21, 115, -21, 115,
+ -21, 115 },
+ { 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38,
+ -15 },
+ { 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5 },
+ { 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1 },
},
{
- {
- -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4,
- },
- {
- -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12,
- },
- {
- -24, 108, -24, 108, -24, 108, -24, 108, -24, 108, -24, 108, -24, 108,
- -24, 108,
- },
- {
- 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49,
- -18,
- },
- {
- 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6,
- },
- {
- 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2,
- },
+ { -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4 },
+ { -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12 },
+ { -24, 108, -24, 108, -24, 108, -24, 108, -24, 108, -24, 108, -24, 108,
+ -24, 108 },
+ { 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49,
+ -18 },
+ { 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6 },
+ { 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2 },
},
{
- {
- -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4,
- },
- {
- -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13,
- },
- {
- -25, 100, -25, 100, -25, 100, -25, 100, -25, 100, -25, 100, -25, 100,
- -25, 100,
- },
- {
- 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60,
- -21,
- },
- {
- 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7,
- },
- {
- 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2,
- },
+ { -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4 },
+ { -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13 },
+ { -25, 100, -25, 100, -25, 100, -25, 100, -25, 100, -25, 100, -25, 100,
+ -25, 100 },
+ { 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60,
+ -21 },
+ { 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7 },
+ { 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2 },
},
{
- {
- -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4,
- },
- {
- -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13,
- },
- {
- -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26,
- 91,
- },
- {
- 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71,
- -24,
- },
- {
- 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7,
- },
- {
- 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2,
- },
+ { -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4 },
+ { -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13 },
+ { -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26,
+ 91 },
+ { 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71,
+ -24 },
+ { 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7 },
+ { 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2 },
},
{
- {
- -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4,
- },
- {
- -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13,
- },
- {
- -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25,
- 81,
- },
- {
- 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81,
- -25,
- },
- {
- 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7,
- },
- {
- 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2,
- },
+ { -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4 },
+ { -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13 },
+ { -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25,
+ 81 },
+ { 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81,
+ -25 },
+ { 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7 },
+ { 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2 },
},
{
- {
- -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4,
- },
- {
- -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13,
- },
- {
- -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24,
- 71,
- },
- {
- 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91,
- -26,
- },
- {
- 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7,
- },
- {
- 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2,
- },
+ { -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4 },
+ { -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13 },
+ { -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24,
+ 71 },
+ { 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91,
+ -26 },
+ { 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7 },
+ { 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2 },
},
{
- {
- -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4,
- },
- {
- -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11,
- },
- {
- -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21,
- 60,
- },
- {
- 100, -25, 100, -25, 100, -25, 100, -25, 100, -25, 100, -25, 100, -25,
- 100, -25,
- },
- {
- 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7,
- },
- {
- 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2,
- },
+ { -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4 },
+ { -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11 },
+ { -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21,
+ 60 },
+ { 100, -25, 100, -25, 100, -25, 100, -25, 100, -25, 100, -25, 100, -25,
+ 100, -25 },
+ { 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7 },
+ { 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2 },
},
{
- {
- -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3,
- },
- {
- -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10,
- },
- {
- -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18,
- 49,
- },
- {
- 108, -24, 108, -24, 108, -24, 108, -24, 108, -24, 108, -24, 108, -24,
- 108, -24,
- },
- {
- 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6,
- },
- {
- 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2,
- },
+ { -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3 },
+ { -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10 },
+ { -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18,
+ 49 },
+ { 108, -24, 108, -24, 108, -24, 108, -24, 108, -24, 108, -24, 108, -24,
+ 108, -24 },
+ { 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6 },
+ { 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2 },
},
{
- {
- -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3,
- },
- {
- -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8,
- },
- {
- -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15,
- 38,
- },
- {
- 115, -21, 115, -21, 115, -21, 115, -21, 115, -21, 115, -21, 115, -21,
- 115, -21,
- },
- {
- 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6,
- },
- {
- 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1,
- },
+ { -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8 },
+ { -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15,
+ 38 },
+ { 115, -21, 115, -21, 115, -21, 115, -21, 115, -21, 115, -21, 115, -21,
+ 115, -21 },
+ { 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6 },
+ { 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1 },
},
{
- {
- -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2,
- },
- {
- -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7,
- },
- {
- -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12,
- 28,
- },
- {
- 120, -18, 120, -18, 120, -18, 120, -18, 120, -18, 120, -18, 120, -18,
- 120, -18,
- },
- {
- 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4,
- },
- {
- 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1,
- },
+ { -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7 },
+ { -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12,
+ 28 },
+ { 120, -18, 120, -18, 120, -18, 120, -18, 120, -18, 120, -18, 120, -18,
+ 120, -18 },
+ { 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4 },
+ { 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1 },
},
{
- {
- -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2,
- },
- {
- -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4,
- },
- {
- -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18,
- },
- {
- 124, -13, 124, -13, 124, -13, 124, -13, 124, -13, 124, -13, 124, -13,
- 124, -13,
- },
- {
- 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3,
- },
- {
- 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1,
- },
+ { -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4 },
+ { -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18 },
+ { 124, -13, 124, -13, 124, -13, 124, -13, 124, -13, 124, -13, 124, -13,
+ 124, -13 },
+ { 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3 },
+ { 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1 },
},
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2,
- },
- {
- -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8,
- },
- {
- 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127,
- -7,
- },
- {
- 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8 },
+ { 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127,
+ -7 },
+ { 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2 },
+ { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 },
+ },
+};
+#endif
+#if USE_TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(16, static const int8_t,
+ sub_pel_filters_temporalfilter_12_signal_dir[15][2][16]) = {
+ {
+ { 0, 1, -1, 3, -7, 127, 8, -4, 2, -1, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 1, -1, 3, -7, 127, 8, -4, 2, -1, 0, 0, 0, 0 },
+ },
+ {
+ { 0, 1, -3, 5, -12, 124, 18, -8, 4, -2, 1, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 1, -3, 5, -12, 124, 18, -8, 4, -2, 1, 0, 0, 0 },
+ },
+ {
+ { -1, 2, -4, 8, -17, 120, 28, -11, 6, -3, 1, -1, 0, 0, 0, 0 },
+ { 0, 0, -1, 2, -4, 8, -17, 120, 28, -11, 6, -3, 1, -1, 0, 0 },
+ },
+ {
+ { -1, 2, -4, 10, -21, 114, 38, -15, 8, -4, 2, -1, 0, 0, 0, 0 },
+ { 0, 0, -1, 2, -4, 10, -21, 114, 38, -15, 8, -4, 2, -1, 0, 0 },
+ },
+ {
+ { -1, 3, -5, 11, -23, 107, 49, -18, 9, -5, 2, -1, 0, 0, 0, 0 },
+ { 0, 0, -1, 3, -5, 11, -23, 107, 49, -18, 9, -5, 2, -1, 0, 0 },
+ },
+ {
+ { -1, 3, -6, 12, -25, 99, 60, -21, 11, -6, 3, -1, 0, 0, 0, 0 },
+ { 0, 0, -1, 3, -6, 12, -25, 99, 60, -21, 11, -6, 3, -1, 0, 0 },
+ },
+ {
+ { -1, 3, -6, 12, -25, 90, 70, -23, 12, -6, 3, -1, 0, 0, 0, 0 },
+ { 0, 0, -1, 3, -6, 12, -25, 90, 70, -23, 12, -6, 3, -1, 0, 0 },
+ },
+ {
+ { -1, 3, -6, 12, -24, 80, 80, -24, 12, -6, 3, -1, 0, 0, 0, 0 },
+ { 0, 0, -1, 3, -6, 12, -24, 80, 80, -24, 12, -6, 3, -1, 0, 0 },
+ },
+ {
+ { -1, 3, -6, 12, -23, 70, 90, -25, 12, -6, 3, -1, 0, 0, 0, 0 },
+ { 0, 0, -1, 3, -6, 12, -23, 70, 90, -25, 12, -6, 3, -1, 0, 0 },
+ },
+ {
+ { -1, 3, -6, 11, -21, 60, 99, -25, 12, -6, 3, -1, 0, 0, 0, 0 },
+ { 0, 0, -1, 3, -6, 11, -21, 60, 99, -25, 12, -6, 3, -1, 0, 0 },
+ },
+ {
+ { -1, 2, -5, 9, -18, 49, 107, -23, 11, -5, 3, -1, 0, 0, 0, 0 },
+ { 0, 0, -1, 2, -5, 9, -18, 49, 107, -23, 11, -5, 3, -1, 0, 0 },
+ },
+ {
+ { -1, 2, -4, 8, -15, 38, 114, -21, 10, -4, 2, -1, 0, 0, 0, 0 },
+ { 0, 0, -1, 2, -4, 8, -15, 38, 114, -21, 10, -4, 2, -1, 0, 0 },
+ },
+ {
+ { -1, 1, -3, 6, -11, 28, 120, -17, 8, -4, 2, -1, 0, 0, 0, 0 },
+ { 0, 0, -1, 1, -3, 6, -11, 28, 120, -17, 8, -4, 2, -1, 0, 0 },
+ },
+ {
+ { 0, 1, -2, 4, -8, 18, 124, -12, 5, -3, 1, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 1, -2, 4, -8, 18, 124, -12, 5, -3, 1, 0, 0, 0 },
+ },
+ {
+ { 0, 0, -1, 2, -4, 8, 127, -7, 3, -1, 1, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, -1, 2, -4, 8, 127, -7, 3, -1, 1, 0, 0, 0 },
+ },
+};
+#endif
+#if USE_TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(16, static const int8_t,
+ sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16]) = {
+ {
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7,
+ 127 },
+ { 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4 },
+ { 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ },
+ {
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5 },
+ { -12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124,
+ -12, 124 },
+ { 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8 },
+ { 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2 },
+ { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 },
+ },
+ {
+ { -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8 },
+ { -17, 120, -17, 120, -17, 120, -17, 120, -17, 120, -17, 120, -17, 120,
+ -17, 120 },
+ { 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28,
+ -11 },
+ { 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3 },
+ { 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1 },
+ },
+ {
+ { -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10 },
+ { -21, 114, -21, 114, -21, 114, -21, 114, -21, 114, -21, 114, -21, 114,
+ -21, 114 },
+ { 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38,
+ -15 },
+ { 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4 },
+ { 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1 },
+ },
+ {
+ { -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11 },
+ { -23, 107, -23, 107, -23, 107, -23, 107, -23, 107, -23, 107, -23, 107,
+ -23, 107 },
+ { 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49,
+ -18 },
+ { 9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5 },
+ { 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1 },
+ },
+ {
+ { -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12 },
+ { -25, 99, -25, 99, -25, 99, -25, 99, -25, 99, -25, 99, -25, 99, -25,
+ 99 },
+ { 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60,
+ -21 },
+ { 11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6 },
+ { 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1 },
+ },
+ {
+ { -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12 },
+ { -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25,
+ 90 },
+ { 70, -23, 70, -23, 70, -23, 70, -23, 70, -23, 70, -23, 70, -23, 70,
+ -23 },
+ { 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6 },
+ { 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1 },
+ },
+ {
+ { -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12 },
+ { -24, 80, -24, 80, -24, 80, -24, 80, -24, 80, -24, 80, -24, 80, -24,
+ 80 },
+ { 80, -24, 80, -24, 80, -24, 80, -24, 80, -24, 80, -24, 80, -24, 80,
+ -24 },
+ { 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6 },
+ { 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1 },
+ },
+ {
+ { -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12 },
+ { -23, 70, -23, 70, -23, 70, -23, 70, -23, 70, -23, 70, -23, 70, -23,
+ 70 },
+ { 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90,
+ -25 },
+ { 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6 },
+ { 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1 },
+ },
+ {
+ { -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11 },
+ { -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21,
+ 60 },
+ { 99, -25, 99, -25, 99, -25, 99, -25, 99, -25, 99, -25, 99, -25, 99,
+ -25 },
+ { 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6 },
+ { 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1 },
+ },
+ {
+ { -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9 },
+ { -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18,
+ 49 },
+ { 107, -23, 107, -23, 107, -23, 107, -23, 107, -23, 107, -23, 107, -23,
+ 107, -23 },
+ { 11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5 },
+ { 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1 },
+ },
+ {
+ { -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8 },
+ { -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15,
+ 38 },
+ { 114, -21, 114, -21, 114, -21, 114, -21, 114, -21, 114, -21, 114, -21,
+ 114, -21 },
+ { 10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4 },
+ { 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1 },
+ },
+ {
+ { -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1 },
+ { -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6 },
+ { -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11,
+ 28 },
+ { 120, -17, 120, -17, 120, -17, 120, -17, 120, -17, 120, -17, 120, -17,
+ 120, -17 },
+ { 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4 },
+ { 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1 },
+ },
+ {
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4 },
+ { -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18 },
+ { 124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12,
+ 124, -12 },
+ { 5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3 },
+ { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 },
+ },
+ {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8 },
+ { 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127,
+ -7 },
+ { 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1 },
+ { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 },
},
};
#endif
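Note on the tables above: each row stores one coefficient pair broadcast eight times across 16 bytes, so a single unaligned 128-bit load yields the pair in every lane (the pairs of any one filter sum to 128). A minimal sketch of how such a row can be consumed, assuming the usual SSSE3 pairing via _mm_maddubs_epi16; the helper name below is illustrative, not part of this patch:

#include <stdint.h>
#include <tmmintrin.h> /* SSSE3 */

/* Illustrative only: multiply eight interleaved (a, b) byte pairs by one
 * coefficient-pair row such as { -1, 2, -1, 2, ... } and obtain eight
 * 16-bit partial sums a*c0 + b*c1. */
static __m128i multiply_pair_row(const uint8_t *pairs, const int8_t *row) {
  const __m128i p = _mm_loadu_si128((const __m128i *)pairs);
  const __m128i c = _mm_loadu_si128((const __m128i *)row);
  return _mm_maddubs_epi16(p, c); /* unsigned bytes x signed bytes -> int16 */
}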
diff --git a/av1/common/x86/av1_convolve_ssse3.c b/av1/common/x86/av1_convolve_ssse3.c
index c52eebc..0db75c2 100644
--- a/av1/common/x86/av1_convolve_ssse3.c
+++ b/av1/common/x86/av1_convolve_ssse3.c
@@ -8,9 +8,11 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+
#include <assert.h>
#include <tmmintrin.h>
+#include "./aom_config.h"
#include "./av1_rtcd.h"
#include "av1/common/filter.h"
#include "av1/common/x86/av1_convolve_filters_ssse3.h"
@@ -21,7 +23,7 @@
typedef const int8_t (*SubpelFilterCoeffs)[16];
static INLINE SubpelFilterCoeffs
-av1_get_subpel_filter_signal_dir(const InterpFilterParams p, int index) {
+get_subpel_filter_signal_dir(const InterpFilterParams p, int index) {
#if CONFIG_EXT_INTERP
if (p.interp_filter == MULTITAP_SHARP2) {
return &sub_pel_filters_12sharp_signal_dir[index][0];
@@ -30,13 +32,18 @@
return &sub_pel_filters_10sharp_signal_dir[index][0];
}
#endif
+#if USE_TEMPORALFILTER_12TAP
+ if (p.interp_filter == TEMPORALFILTER_12TAP) {
+ return &sub_pel_filters_temporalfilter_12_signal_dir[index][0];
+ }
+#endif
(void)p;
(void)index;
return NULL;
}
static INLINE SubpelFilterCoeffs
-av1_get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
+get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
#if CONFIG_EXT_INTERP
if (p.interp_filter == MULTITAP_SHARP2) {
return &sub_pel_filters_12sharp_ver_signal_dir[index][0];
@@ -45,6 +52,11 @@
return &sub_pel_filters_10sharp_ver_signal_dir[index][0];
}
#endif
+#if USE_TEMPORALFILTER_12TAP
+ if (p.interp_filter == TEMPORALFILTER_12TAP) {
+ return &sub_pel_filters_temporalfilter_12_ver_signal_dir[index][0];
+ }
+#endif
(void)p;
(void)index;
return NULL;
@@ -668,9 +680,8 @@
return;
}
- hCoeffs = av1_get_subpel_filter_signal_dir(filter_params, subpel_x_q4 - 1);
- vCoeffs =
- av1_get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1);
+ hCoeffs = get_subpel_filter_signal_dir(filter_params, subpel_x_q4 - 1);
+ vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1);
if (!hCoeffs || !vCoeffs) {
av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
@@ -877,8 +888,7 @@
return;
}
- vCoeffs =
- av1_get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
+ vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
if (!vCoeffs) {
av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
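Background for the convolution hunks above: when the coefficient lookup returns NULL the code falls back to the C reference path, and both paths compute the same rounded, clamped fixed-point dot product, since the subpel filter taps sum to 128 (FILTER_BITS = 7). A scalar sketch of one filtered sample; the function name and the fixed 8-tap length are illustrative assumptions, not the patch's API:

#include <stdint.h>

/* Illustrative scalar reference: apply an 8-tap filter whose taps sum to
 * 128, round to nearest, and clamp to the 8-bit pixel range. */
static uint8_t filter_one_sample(const uint8_t *src, const int16_t *taps) {
  int32_t sum = 0;
  for (int k = 0; k < 8; ++k) sum += taps[k] * src[k];
  sum = (sum + 64) >> 7; /* round and scale by 1/128 */
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}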
diff --git a/av1/common/x86/av1_fwd_dct32x32_impl_sse2.h b/av1/common/x86/av1_fwd_dct32x32_impl_sse2.h
deleted file mode 100644
index 876e579..0000000
--- a/av1/common/x86/av1_fwd_dct32x32_impl_sse2.h
+++ /dev/null
@@ -1,3202 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "./av1_rtcd.h"
-#include "av1/common/av1_fwd_txfm.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-// TODO(jingning) The high bit-depth version needs re-work for performance.
-// The current SSE2 implementation also cross-references the static
-// functions in the C implementation file.
-#if DCT_HIGH_BIT_DEPTH
-#define ADD_EPI16 _mm_adds_epi16
-#define SUB_EPI16 _mm_subs_epi16
-#if FDCT32x32_HIGH_PRECISION
-void av1_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
- int i, j;
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
- av1_fdct32(temp_in, temp_out, 0);
- for (j = 0; j < 32; ++j)
- out[j + i * 32] =
- (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
- }
-}
-#define HIGH_FDCT32x32_2D_C av1_highbd_fdct32x32_c
-#define HIGH_FDCT32x32_2D_ROWS_C av1_fdct32x32_rows_c
-#else
-void av1_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
- int i, j;
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = intermediate[j * 32 + i];
- av1_fdct32(temp_in, temp_out, 1);
- for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
- }
-}
-#define HIGH_FDCT32x32_2D_C av1_highbd_fdct32x32_rd_c
-#define HIGH_FDCT32x32_2D_ROWS_C av1_fdct32x32_rd_rows_c
-#endif // FDCT32x32_HIGH_PRECISION
-#else
-#define ADD_EPI16 _mm_add_epi16
-#define SUB_EPI16 _mm_sub_epi16
-#endif // DCT_HIGH_BIT_DEPTH
-
-void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
- // Calculate pre-multiplied strides
- const int str1 = stride;
- const int str2 = 2 * stride;
- const int str3 = 2 * stride + str1;
- // We need an intermediate buffer between passes.
- DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
- // Constants
- // When we use them, in one case they are all the same; in all others it's
- // a pair of values that we need to repeat four times. This is done by
- // constructing the 32-bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
- const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
- const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
- const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
- const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
- const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
- const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
- const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
- const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
- const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
- const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
- const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
- const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
- const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
- const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
- const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
- const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
- const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
- const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
- const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
- const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
- const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
- const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kZero = _mm_set1_epi16(0);
- const __m128i kOne = _mm_set1_epi16(1);
- // Do the two transform/transpose passes
- int pass;
-#if DCT_HIGH_BIT_DEPTH
- int overflow;
-#endif
- for (pass = 0; pass < 2; ++pass) {
- // We process eight columns (transposed rows in second pass) at a time.
- int column_start;
- for (column_start = 0; column_start < 32; column_start += 8) {
- __m128i step1[32];
- __m128i step2[32];
- __m128i step3[32];
- __m128i out[32];
- // Stage 1
- // Note: even though all the loads below are aligned, using the aligned
- // intrinsic makes the code slightly slower.
- if (0 == pass) {
- const int16_t *in = &input[column_start];
- // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
- // Note: the next four blocks could be in a loop. That would help the
- // instruction cache but is actually slower.
- {
- const int16_t *ina = in + 0 * str1;
- const int16_t *inb = in + 31 * str1;
- __m128i *step1a = &step1[0];
- __m128i *step1b = &step1[31];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[0] = _mm_add_epi16(ina0, inb0);
- step1a[1] = _mm_add_epi16(ina1, inb1);
- step1a[2] = _mm_add_epi16(ina2, inb2);
- step1a[3] = _mm_add_epi16(ina3, inb3);
- step1b[-3] = _mm_sub_epi16(ina3, inb3);
- step1b[-2] = _mm_sub_epi16(ina2, inb2);
- step1b[-1] = _mm_sub_epi16(ina1, inb1);
- step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[0] = _mm_slli_epi16(step1a[0], 2);
- step1a[1] = _mm_slli_epi16(step1a[1], 2);
- step1a[2] = _mm_slli_epi16(step1a[2], 2);
- step1a[3] = _mm_slli_epi16(step1a[3], 2);
- step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
- step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
- step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
- step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
- }
- {
- const int16_t *ina = in + 4 * str1;
- const int16_t *inb = in + 27 * str1;
- __m128i *step1a = &step1[4];
- __m128i *step1b = &step1[27];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[0] = _mm_add_epi16(ina0, inb0);
- step1a[1] = _mm_add_epi16(ina1, inb1);
- step1a[2] = _mm_add_epi16(ina2, inb2);
- step1a[3] = _mm_add_epi16(ina3, inb3);
- step1b[-3] = _mm_sub_epi16(ina3, inb3);
- step1b[-2] = _mm_sub_epi16(ina2, inb2);
- step1b[-1] = _mm_sub_epi16(ina1, inb1);
- step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[0] = _mm_slli_epi16(step1a[0], 2);
- step1a[1] = _mm_slli_epi16(step1a[1], 2);
- step1a[2] = _mm_slli_epi16(step1a[2], 2);
- step1a[3] = _mm_slli_epi16(step1a[3], 2);
- step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
- step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
- step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
- step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
- }
- {
- const int16_t *ina = in + 8 * str1;
- const int16_t *inb = in + 23 * str1;
- __m128i *step1a = &step1[8];
- __m128i *step1b = &step1[23];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[0] = _mm_add_epi16(ina0, inb0);
- step1a[1] = _mm_add_epi16(ina1, inb1);
- step1a[2] = _mm_add_epi16(ina2, inb2);
- step1a[3] = _mm_add_epi16(ina3, inb3);
- step1b[-3] = _mm_sub_epi16(ina3, inb3);
- step1b[-2] = _mm_sub_epi16(ina2, inb2);
- step1b[-1] = _mm_sub_epi16(ina1, inb1);
- step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[0] = _mm_slli_epi16(step1a[0], 2);
- step1a[1] = _mm_slli_epi16(step1a[1], 2);
- step1a[2] = _mm_slli_epi16(step1a[2], 2);
- step1a[3] = _mm_slli_epi16(step1a[3], 2);
- step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
- step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
- step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
- step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
- }
- {
- const int16_t *ina = in + 12 * str1;
- const int16_t *inb = in + 19 * str1;
- __m128i *step1a = &step1[12];
- __m128i *step1b = &step1[19];
- const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
- const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
- const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
- const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
- const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
- const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
- const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
- const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
- step1a[0] = _mm_add_epi16(ina0, inb0);
- step1a[1] = _mm_add_epi16(ina1, inb1);
- step1a[2] = _mm_add_epi16(ina2, inb2);
- step1a[3] = _mm_add_epi16(ina3, inb3);
- step1b[-3] = _mm_sub_epi16(ina3, inb3);
- step1b[-2] = _mm_sub_epi16(ina2, inb2);
- step1b[-1] = _mm_sub_epi16(ina1, inb1);
- step1b[-0] = _mm_sub_epi16(ina0, inb0);
- step1a[0] = _mm_slli_epi16(step1a[0], 2);
- step1a[1] = _mm_slli_epi16(step1a[1], 2);
- step1a[2] = _mm_slli_epi16(step1a[2], 2);
- step1a[3] = _mm_slli_epi16(step1a[3], 2);
- step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
- step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
- step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
- step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
- }
- } else {
- int16_t *in = &intermediate[column_start];
- // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
- // Note: using the same approach as above to have a common offset is
- // counter-productive as all offsets can be calculated at compile
- // time.
- // Note: the next four blocks could be in a loop. That would help the
- // instruction cache but is actually slower.
- {
- __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
- __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
- __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
- __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
- __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
- __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
- __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
- __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
- step1[0] = ADD_EPI16(in00, in31);
- step1[1] = ADD_EPI16(in01, in30);
- step1[2] = ADD_EPI16(in02, in29);
- step1[3] = ADD_EPI16(in03, in28);
- step1[28] = SUB_EPI16(in03, in28);
- step1[29] = SUB_EPI16(in02, in29);
- step1[30] = SUB_EPI16(in01, in30);
- step1[31] = SUB_EPI16(in00, in31);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
- &step1[3], &step1[28], &step1[29],
- &step1[30], &step1[31]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
- __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
- __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
- __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
- __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
- __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
- __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
- __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
- step1[4] = ADD_EPI16(in04, in27);
- step1[5] = ADD_EPI16(in05, in26);
- step1[6] = ADD_EPI16(in06, in25);
- step1[7] = ADD_EPI16(in07, in24);
- step1[24] = SUB_EPI16(in07, in24);
- step1[25] = SUB_EPI16(in06, in25);
- step1[26] = SUB_EPI16(in05, in26);
- step1[27] = SUB_EPI16(in04, in27);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
- &step1[7], &step1[24], &step1[25],
- &step1[26], &step1[27]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
- __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
- __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
- __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
- __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
- __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
- __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
- __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
- step1[8] = ADD_EPI16(in08, in23);
- step1[9] = ADD_EPI16(in09, in22);
- step1[10] = ADD_EPI16(in10, in21);
- step1[11] = ADD_EPI16(in11, in20);
- step1[20] = SUB_EPI16(in11, in20);
- step1[21] = SUB_EPI16(in10, in21);
- step1[22] = SUB_EPI16(in09, in22);
- step1[23] = SUB_EPI16(in08, in23);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
- &step1[11], &step1[20], &step1[21],
- &step1[22], &step1[23]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
- __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
- __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
- __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
- __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
- __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
- __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
- __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
- step1[12] = ADD_EPI16(in12, in19);
- step1[13] = ADD_EPI16(in13, in18);
- step1[14] = ADD_EPI16(in14, in17);
- step1[15] = ADD_EPI16(in15, in16);
- step1[16] = SUB_EPI16(in15, in16);
- step1[17] = SUB_EPI16(in14, in17);
- step1[18] = SUB_EPI16(in13, in18);
- step1[19] = SUB_EPI16(in12, in19);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
- &step1[15], &step1[16], &step1[17],
- &step1[18], &step1[19]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- // Stage 2
- {
- step2[0] = ADD_EPI16(step1[0], step1[15]);
- step2[1] = ADD_EPI16(step1[1], step1[14]);
- step2[2] = ADD_EPI16(step1[2], step1[13]);
- step2[3] = ADD_EPI16(step1[3], step1[12]);
- step2[4] = ADD_EPI16(step1[4], step1[11]);
- step2[5] = ADD_EPI16(step1[5], step1[10]);
- step2[6] = ADD_EPI16(step1[6], step1[9]);
- step2[7] = ADD_EPI16(step1[7], step1[8]);
- step2[8] = SUB_EPI16(step1[7], step1[8]);
- step2[9] = SUB_EPI16(step1[6], step1[9]);
- step2[10] = SUB_EPI16(step1[5], step1[10]);
- step2[11] = SUB_EPI16(step1[4], step1[11]);
- step2[12] = SUB_EPI16(step1[3], step1[12]);
- step2[13] = SUB_EPI16(step1[2], step1[13]);
- step2[14] = SUB_EPI16(step1[1], step1[14]);
- step2[15] = SUB_EPI16(step1[0], step1[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
- &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
- &step2[12], &step2[13], &step2[14], &step2[15]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
- const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
- const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
- const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
- const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
- const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
- const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
- const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
- const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
- const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
- const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
- const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
- const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
- const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
- const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
- const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
- const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
- const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
- const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
- const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
- const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
- const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
- const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
- const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
- const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
- const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
- const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
- const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
- const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
- const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
- const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
- const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
- const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
- const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
- const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
- const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
- const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
- const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
- const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
- // Combine
- step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
- step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
- step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
- step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
- step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
- step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
- step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
- step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
- &step2[23], &step2[24], &step2[25],
- &step2[26], &step2[27]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
-
-#if !FDCT32x32_HIGH_PRECISION
- // Scale the magnitude down so that the intermediate values stay within
- // the range of 16 bits.
- if (1 == pass) {
- __m128i s3_00_0 = _mm_cmplt_epi16(step2[0], kZero);
- __m128i s3_01_0 = _mm_cmplt_epi16(step2[1], kZero);
- __m128i s3_02_0 = _mm_cmplt_epi16(step2[2], kZero);
- __m128i s3_03_0 = _mm_cmplt_epi16(step2[3], kZero);
- __m128i s3_04_0 = _mm_cmplt_epi16(step2[4], kZero);
- __m128i s3_05_0 = _mm_cmplt_epi16(step2[5], kZero);
- __m128i s3_06_0 = _mm_cmplt_epi16(step2[6], kZero);
- __m128i s3_07_0 = _mm_cmplt_epi16(step2[7], kZero);
- __m128i s2_08_0 = _mm_cmplt_epi16(step2[8], kZero);
- __m128i s2_09_0 = _mm_cmplt_epi16(step2[9], kZero);
- __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
- __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
- __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
- __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
- __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
- __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
- __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
- __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
- __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
- __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
- __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
- __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
- __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
- __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
- __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
- __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
- __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
- __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
- __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
- __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
- __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
- __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
-
- step2[0] = SUB_EPI16(step2[0], s3_00_0);
- step2[1] = SUB_EPI16(step2[1], s3_01_0);
- step2[2] = SUB_EPI16(step2[2], s3_02_0);
- step2[3] = SUB_EPI16(step2[3], s3_03_0);
- step2[4] = SUB_EPI16(step2[4], s3_04_0);
- step2[5] = SUB_EPI16(step2[5], s3_05_0);
- step2[6] = SUB_EPI16(step2[6], s3_06_0);
- step2[7] = SUB_EPI16(step2[7], s3_07_0);
- step2[8] = SUB_EPI16(step2[8], s2_08_0);
- step2[9] = SUB_EPI16(step2[9], s2_09_0);
- step2[10] = SUB_EPI16(step2[10], s3_10_0);
- step2[11] = SUB_EPI16(step2[11], s3_11_0);
- step2[12] = SUB_EPI16(step2[12], s3_12_0);
- step2[13] = SUB_EPI16(step2[13], s3_13_0);
- step2[14] = SUB_EPI16(step2[14], s2_14_0);
- step2[15] = SUB_EPI16(step2[15], s2_15_0);
- step1[16] = SUB_EPI16(step1[16], s3_16_0);
- step1[17] = SUB_EPI16(step1[17], s3_17_0);
- step1[18] = SUB_EPI16(step1[18], s3_18_0);
- step1[19] = SUB_EPI16(step1[19], s3_19_0);
- step2[20] = SUB_EPI16(step2[20], s3_20_0);
- step2[21] = SUB_EPI16(step2[21], s3_21_0);
- step2[22] = SUB_EPI16(step2[22], s3_22_0);
- step2[23] = SUB_EPI16(step2[23], s3_23_0);
- step2[24] = SUB_EPI16(step2[24], s3_24_0);
- step2[25] = SUB_EPI16(step2[25], s3_25_0);
- step2[26] = SUB_EPI16(step2[26], s3_26_0);
- step2[27] = SUB_EPI16(step2[27], s3_27_0);
- step1[28] = SUB_EPI16(step1[28], s3_28_0);
- step1[29] = SUB_EPI16(step1[29], s3_29_0);
- step1[30] = SUB_EPI16(step1[30], s3_30_0);
- step1[31] = SUB_EPI16(step1[31], s3_31_0);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x32(
- &step2[0], &step2[1], &step2[2], &step2[3], &step2[4], &step2[5],
- &step2[6], &step2[7], &step2[8], &step2[9], &step2[10], &step2[11],
- &step2[12], &step2[13], &step2[14], &step2[15], &step1[16],
- &step1[17], &step1[18], &step1[19], &step2[20], &step2[21],
- &step2[22], &step2[23], &step2[24], &step2[25], &step2[26],
- &step2[27], &step1[28], &step1[29], &step1[30], &step1[31]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- step2[0] = _mm_add_epi16(step2[0], kOne);
- step2[1] = _mm_add_epi16(step2[1], kOne);
- step2[2] = _mm_add_epi16(step2[2], kOne);
- step2[3] = _mm_add_epi16(step2[3], kOne);
- step2[4] = _mm_add_epi16(step2[4], kOne);
- step2[5] = _mm_add_epi16(step2[5], kOne);
- step2[6] = _mm_add_epi16(step2[6], kOne);
- step2[7] = _mm_add_epi16(step2[7], kOne);
- step2[8] = _mm_add_epi16(step2[8], kOne);
- step2[9] = _mm_add_epi16(step2[9], kOne);
- step2[10] = _mm_add_epi16(step2[10], kOne);
- step2[11] = _mm_add_epi16(step2[11], kOne);
- step2[12] = _mm_add_epi16(step2[12], kOne);
- step2[13] = _mm_add_epi16(step2[13], kOne);
- step2[14] = _mm_add_epi16(step2[14], kOne);
- step2[15] = _mm_add_epi16(step2[15], kOne);
- step1[16] = _mm_add_epi16(step1[16], kOne);
- step1[17] = _mm_add_epi16(step1[17], kOne);
- step1[18] = _mm_add_epi16(step1[18], kOne);
- step1[19] = _mm_add_epi16(step1[19], kOne);
- step2[20] = _mm_add_epi16(step2[20], kOne);
- step2[21] = _mm_add_epi16(step2[21], kOne);
- step2[22] = _mm_add_epi16(step2[22], kOne);
- step2[23] = _mm_add_epi16(step2[23], kOne);
- step2[24] = _mm_add_epi16(step2[24], kOne);
- step2[25] = _mm_add_epi16(step2[25], kOne);
- step2[26] = _mm_add_epi16(step2[26], kOne);
- step2[27] = _mm_add_epi16(step2[27], kOne);
- step1[28] = _mm_add_epi16(step1[28], kOne);
- step1[29] = _mm_add_epi16(step1[29], kOne);
- step1[30] = _mm_add_epi16(step1[30], kOne);
- step1[31] = _mm_add_epi16(step1[31], kOne);
-
- step2[0] = _mm_srai_epi16(step2[0], 2);
- step2[1] = _mm_srai_epi16(step2[1], 2);
- step2[2] = _mm_srai_epi16(step2[2], 2);
- step2[3] = _mm_srai_epi16(step2[3], 2);
- step2[4] = _mm_srai_epi16(step2[4], 2);
- step2[5] = _mm_srai_epi16(step2[5], 2);
- step2[6] = _mm_srai_epi16(step2[6], 2);
- step2[7] = _mm_srai_epi16(step2[7], 2);
- step2[8] = _mm_srai_epi16(step2[8], 2);
- step2[9] = _mm_srai_epi16(step2[9], 2);
- step2[10] = _mm_srai_epi16(step2[10], 2);
- step2[11] = _mm_srai_epi16(step2[11], 2);
- step2[12] = _mm_srai_epi16(step2[12], 2);
- step2[13] = _mm_srai_epi16(step2[13], 2);
- step2[14] = _mm_srai_epi16(step2[14], 2);
- step2[15] = _mm_srai_epi16(step2[15], 2);
- step1[16] = _mm_srai_epi16(step1[16], 2);
- step1[17] = _mm_srai_epi16(step1[17], 2);
- step1[18] = _mm_srai_epi16(step1[18], 2);
- step1[19] = _mm_srai_epi16(step1[19], 2);
- step2[20] = _mm_srai_epi16(step2[20], 2);
- step2[21] = _mm_srai_epi16(step2[21], 2);
- step2[22] = _mm_srai_epi16(step2[22], 2);
- step2[23] = _mm_srai_epi16(step2[23], 2);
- step2[24] = _mm_srai_epi16(step2[24], 2);
- step2[25] = _mm_srai_epi16(step2[25], 2);
- step2[26] = _mm_srai_epi16(step2[26], 2);
- step2[27] = _mm_srai_epi16(step2[27], 2);
- step1[28] = _mm_srai_epi16(step1[28], 2);
- step1[29] = _mm_srai_epi16(step1[29], 2);
- step1[30] = _mm_srai_epi16(step1[30], 2);
- step1[31] = _mm_srai_epi16(step1[31], 2);
- }
-#endif // !FDCT32x32_HIGH_PRECISION
-
-#if FDCT32x32_HIGH_PRECISION
- if (pass == 0) {
-#endif
- // Stage 3
- {
- step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
- step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
- step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
- step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
- step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
- step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
- step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
- step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
- &step3[3], &step3[4], &step3[5],
- &step3[6], &step3[7]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
- const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
- const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
- const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
- const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
- const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
- const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
- const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
- const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
- const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
- const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
- const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
- const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
- const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
- const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
- const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
- const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
- const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
- const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
- // Combine
- step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
- step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
- step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
- step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&step3[10], &step3[11], &step3[12],
- &step3[13]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- step3[16] = ADD_EPI16(step2[23], step1[16]);
- step3[17] = ADD_EPI16(step2[22], step1[17]);
- step3[18] = ADD_EPI16(step2[21], step1[18]);
- step3[19] = ADD_EPI16(step2[20], step1[19]);
- step3[20] = SUB_EPI16(step1[19], step2[20]);
- step3[21] = SUB_EPI16(step1[18], step2[21]);
- step3[22] = SUB_EPI16(step1[17], step2[22]);
- step3[23] = SUB_EPI16(step1[16], step2[23]);
- step3[24] = SUB_EPI16(step1[31], step2[24]);
- step3[25] = SUB_EPI16(step1[30], step2[25]);
- step3[26] = SUB_EPI16(step1[29], step2[26]);
- step3[27] = SUB_EPI16(step1[28], step2[27]);
- step3[28] = ADD_EPI16(step2[27], step1[28]);
- step3[29] = ADD_EPI16(step2[26], step1[29]);
- step3[30] = ADD_EPI16(step2[25], step1[30]);
- step3[31] = ADD_EPI16(step2[24], step1[31]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step3[16], &step3[17], &step3[18], &step3[19], &step3[20],
- &step3[21], &step3[22], &step3[23], &step3[24], &step3[25],
- &step3[26], &step3[27], &step3[28], &step3[29], &step3[30],
- &step3[31]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
-
- // Stage 4
- {
- step1[0] = ADD_EPI16(step3[3], step3[0]);
- step1[1] = ADD_EPI16(step3[2], step3[1]);
- step1[2] = SUB_EPI16(step3[1], step3[2]);
- step1[3] = SUB_EPI16(step3[0], step3[3]);
- step1[8] = ADD_EPI16(step3[11], step2[8]);
- step1[9] = ADD_EPI16(step3[10], step2[9]);
- step1[10] = SUB_EPI16(step2[9], step3[10]);
- step1[11] = SUB_EPI16(step2[8], step3[11]);
- step1[12] = SUB_EPI16(step2[15], step3[12]);
- step1[13] = SUB_EPI16(step2[14], step3[13]);
- step1[14] = ADD_EPI16(step3[13], step2[14]);
- step1[15] = ADD_EPI16(step3[12], step2[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step1[0], &step1[1], &step1[2], &step1[3], &step1[4], &step1[5],
- &step1[6], &step1[7], &step1[8], &step1[9], &step1[10],
- &step1[11], &step1[12], &step1[13], &step1[14], &step1[15]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
- const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
- const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
- const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
- const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
- const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
- const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
- const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
- const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
- // Combine
- step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
- step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
- const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
- const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
- const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
- const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
- const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
- const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
- const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
- const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
- const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
- const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
- const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
- const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
- const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
- const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
- const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
- const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
- const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
- const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
- const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
- const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
- const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
- const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
- const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
- // dct_const_round_shift
- const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
- const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
- const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
- const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
- const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
- const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
- const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
- const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
- const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
- const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
- const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
- const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
- const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
- const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
- const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
- const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
- const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
- const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
- // Combine
- step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
- step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
- step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
- step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
- step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
- step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
- step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
- step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
- &step1[21], &step1[26], &step1[27],
- &step1[28], &step1[29]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Stage 5
- {
- step2[4] = ADD_EPI16(step1[5], step3[4]);
- step2[5] = SUB_EPI16(step3[4], step1[5]);
- step2[6] = SUB_EPI16(step3[7], step1[6]);
- step2[7] = ADD_EPI16(step1[6], step3[7]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&step2[4], &step2[5], &step2[6],
- &step2[7]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
- const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
- const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
- const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
- const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
- const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
- const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
- const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
- const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
- const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
- const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
- const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i out_00_4 =
- _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
- const __m128i out_00_5 =
- _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
- const __m128i out_16_4 =
- _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
- const __m128i out_16_5 =
- _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
- const __m128i out_08_4 =
- _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
- const __m128i out_08_5 =
- _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
- const __m128i out_24_4 =
- _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
- const __m128i out_24_5 =
- _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
- const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
- const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
- const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
- const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
- const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
- const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
- const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
- const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
- // Combine
- out[0] = _mm_packs_epi32(out_00_6, out_00_7);
- out[16] = _mm_packs_epi32(out_16_6, out_16_7);
- out[8] = _mm_packs_epi32(out_08_6, out_08_7);
- out[24] = _mm_packs_epi32(out_24_6, out_24_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
- const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
- const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
- const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
- const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
- const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
- const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
- const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
- const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
- const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
- const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
- const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
- // dct_const_round_shift
- const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
- const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
- const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
- const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
- const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
- const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
- const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
- const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
- const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
- const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
- // Combine
- step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
- step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
- step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
- step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&step2[9], &step2[10], &step2[13],
- &step2[14]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- step2[16] = ADD_EPI16(step1[19], step3[16]);
- step2[17] = ADD_EPI16(step1[18], step3[17]);
- step2[18] = SUB_EPI16(step3[17], step1[18]);
- step2[19] = SUB_EPI16(step3[16], step1[19]);
- step2[20] = SUB_EPI16(step3[23], step1[20]);
- step2[21] = SUB_EPI16(step3[22], step1[21]);
- step2[22] = ADD_EPI16(step1[21], step3[22]);
- step2[23] = ADD_EPI16(step1[20], step3[23]);
- step2[24] = ADD_EPI16(step1[27], step3[24]);
- step2[25] = ADD_EPI16(step1[26], step3[25]);
- step2[26] = SUB_EPI16(step3[25], step1[26]);
- step2[27] = SUB_EPI16(step3[24], step1[27]);
- step2[28] = SUB_EPI16(step3[31], step1[28]);
- step2[29] = SUB_EPI16(step3[30], step1[29]);
- step2[30] = ADD_EPI16(step1[29], step3[30]);
- step2[31] = ADD_EPI16(step1[28], step3[31]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step2[16], &step2[17], &step2[18], &step2[19], &step2[20],
- &step2[21], &step2[22], &step2[23], &step2[24], &step2[25],
- &step2[26], &step2[27], &step2[28], &step2[29], &step2[30],
- &step2[31]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Stage 6
- {
- const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
- const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
- const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
- const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
- const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
- const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
- const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
- const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
- const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
- const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
- const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
- const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
- const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
- const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
- const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
- const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
- // dct_const_round_shift
- const __m128i out_04_4 =
- _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
- const __m128i out_04_5 =
- _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
- const __m128i out_20_4 =
- _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
- const __m128i out_20_5 =
- _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
- const __m128i out_12_4 =
- _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
- const __m128i out_12_5 =
- _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
- const __m128i out_28_4 =
- _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
- const __m128i out_28_5 =
- _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
- const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
- const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
- const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
- const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
- const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
- const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
- const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
- const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
- // Combine
- out[4] = _mm_packs_epi32(out_04_6, out_04_7);
- out[20] = _mm_packs_epi32(out_20_6, out_20_7);
- out[12] = _mm_packs_epi32(out_12_6, out_12_7);
- out[28] = _mm_packs_epi32(out_28_6, out_28_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- step3[8] = ADD_EPI16(step2[9], step1[8]);
- step3[9] = SUB_EPI16(step1[8], step2[9]);
- step3[10] = SUB_EPI16(step1[11], step2[10]);
- step3[11] = ADD_EPI16(step2[10], step1[11]);
- step3[12] = ADD_EPI16(step2[13], step1[12]);
- step3[13] = SUB_EPI16(step1[12], step2[13]);
- step3[14] = SUB_EPI16(step1[15], step2[14]);
- step3[15] = ADD_EPI16(step2[14], step1[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
- &step3[11], &step3[12], &step3[13],
- &step3[14], &step3[15]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
- const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
- const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
- const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
- const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
- const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
- const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
- const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
- const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
- const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
- const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
- const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
- const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
- const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
- const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
- const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
- const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
- const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
- const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
- const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
- const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
- const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
- const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
- const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
- // dct_const_round_shift
- const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
- const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
- const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
- const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
- const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
- const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
- const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
- const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
- const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
- const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
- const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
- const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
- const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
- const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
- const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
- const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
- // Combine
- step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
- step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
- step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
- step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
- // Combine
- step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
- step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
- step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
- step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
- &step3[22], &step3[25], &step3[26],
- &step3[29], &step3[30]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Stage 7
- {
- const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
- const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
- const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
- const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
- const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
- const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
- const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
- const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
- const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
- const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
- const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
- const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
- const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
- const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
- const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
- const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
- const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
- const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
- const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
- const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
- const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
- const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
- const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
- const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
- // dct_const_round_shift
- const __m128i out_02_4 =
- _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
- const __m128i out_02_5 =
- _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
- const __m128i out_18_4 =
- _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
- const __m128i out_18_5 =
- _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
- const __m128i out_10_4 =
- _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
- const __m128i out_10_5 =
- _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
- const __m128i out_26_4 =
- _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
- const __m128i out_26_5 =
- _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
- const __m128i out_06_4 =
- _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
- const __m128i out_06_5 =
- _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
- const __m128i out_22_4 =
- _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
- const __m128i out_22_5 =
- _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
- const __m128i out_14_4 =
- _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
- const __m128i out_14_5 =
- _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
- const __m128i out_30_4 =
- _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
- const __m128i out_30_5 =
- _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
- const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
- const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
- const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
- const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
- const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
- const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
- const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
- const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
- const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
- const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
- const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
- const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
- const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
- const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
- const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
- const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
- // Combine
- out[2] = _mm_packs_epi32(out_02_6, out_02_7);
- out[18] = _mm_packs_epi32(out_18_6, out_18_7);
- out[10] = _mm_packs_epi32(out_10_6, out_10_7);
- out[26] = _mm_packs_epi32(out_26_6, out_26_7);
- out[6] = _mm_packs_epi32(out_06_6, out_06_7);
- out[22] = _mm_packs_epi32(out_22_6, out_22_7);
- out[14] = _mm_packs_epi32(out_14_6, out_14_7);
- out[30] = _mm_packs_epi32(out_30_6, out_30_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
- &out[6], &out[22], &out[14], &out[30]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- step1[16] = ADD_EPI16(step3[17], step2[16]);
- step1[17] = SUB_EPI16(step2[16], step3[17]);
- step1[18] = SUB_EPI16(step2[19], step3[18]);
- step1[19] = ADD_EPI16(step3[18], step2[19]);
- step1[20] = ADD_EPI16(step3[21], step2[20]);
- step1[21] = SUB_EPI16(step2[20], step3[21]);
- step1[22] = SUB_EPI16(step2[23], step3[22]);
- step1[23] = ADD_EPI16(step3[22], step2[23]);
- step1[24] = ADD_EPI16(step3[25], step2[24]);
- step1[25] = SUB_EPI16(step2[24], step3[25]);
- step1[26] = SUB_EPI16(step2[27], step3[26]);
- step1[27] = ADD_EPI16(step3[26], step2[27]);
- step1[28] = ADD_EPI16(step3[29], step2[28]);
- step1[29] = SUB_EPI16(step2[28], step3[29]);
- step1[30] = SUB_EPI16(step2[31], step3[30]);
- step1[31] = ADD_EPI16(step3[30], step2[31]);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x16(
- &step1[16], &step1[17], &step1[18], &step1[19], &step1[20],
- &step1[21], &step1[22], &step1[23], &step1[24], &step1[25],
- &step1[26], &step1[27], &step1[28], &step1[29], &step1[30],
- &step1[31]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
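// Each DCT_HIGH_BIT_DEPTH block above is a bail-out: after a stage completes,
// the 16-bit lanes are tested for saturation and, if any overflowed, the
// transform is redone in C (the full transform when pass == 0, only the row
// pass otherwise) instead of continuing with clipped intermediates. A scalar
// sketch of what such a saturation test amounts to (hypothetical helper, not
// the actual check_epi16_overflow_xN implementation):
static int any_lane_saturated(const int16_t *v, int n) {
  int i;
  for (i = 0; i < n; ++i) {
    if (v[i] == INT16_MAX || v[i] == INT16_MIN) return 1;
  }
  return 0;
}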
- // Final stage --- output indices are bit-reversed.
- {
- const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
- const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
- const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
- const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
- const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
- const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
- const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
- const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
- const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
- const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
- const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
- const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
- const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
- const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
- const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
- const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
- const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
- const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
- const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
- const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
- const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
- const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
- const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
- const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
- // dct_const_round_shift
- const __m128i out_01_4 =
- _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
- const __m128i out_01_5 =
- _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
- const __m128i out_17_4 =
- _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
- const __m128i out_17_5 =
- _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
- const __m128i out_09_4 =
- _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
- const __m128i out_09_5 =
- _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
- const __m128i out_25_4 =
- _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
- const __m128i out_25_5 =
- _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
- const __m128i out_07_4 =
- _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
- const __m128i out_07_5 =
- _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
- const __m128i out_23_4 =
- _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
- const __m128i out_23_5 =
- _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
- const __m128i out_15_4 =
- _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
- const __m128i out_15_5 =
- _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
- const __m128i out_31_4 =
- _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
- const __m128i out_31_5 =
- _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
- const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
- const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
- const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
- const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
- const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
- const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
- const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
- const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
- const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
- const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
- const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
- const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
- const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
- const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
- const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
- const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
- // Combine
- out[1] = _mm_packs_epi32(out_01_6, out_01_7);
- out[17] = _mm_packs_epi32(out_17_6, out_17_7);
- out[9] = _mm_packs_epi32(out_09_6, out_09_7);
- out[25] = _mm_packs_epi32(out_25_6, out_25_7);
- out[7] = _mm_packs_epi32(out_07_6, out_07_7);
- out[23] = _mm_packs_epi32(out_23_6, out_23_7);
- out[15] = _mm_packs_epi32(out_15_6, out_15_7);
- out[31] = _mm_packs_epi32(out_31_6, out_31_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
- &out[7], &out[23], &out[15], &out[31]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
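// The stores above follow the bit-reversed ordering the comment mentions:
// the destination index is the 5-bit bit reversal of the lower source index
// of each butterfly (step1[16] -> out[1], step1[18] -> out[9],
// step1[19] -> out[25], and so on). A small sketch of that mapping:
static int bitrev5(int i) {
  int r = 0, b;
  for (b = 0; b < 5; ++b) r |= ((i >> b) & 1) << (4 - b);
  return r;
}
// e.g. bitrev5(18) == 9, matching the step1[18]/step1[29] pair stored to
// out[9] above.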
- {
- const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
- const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
- const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
- const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
- const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
- const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
- const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
- const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
- const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
- const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
- const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
- const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
- const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
- const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
- const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
- const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
- const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
- const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
- const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
- const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
- const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
- const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
- const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
- const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
- // dct_const_round_shift
- const __m128i out_05_4 =
- _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
- const __m128i out_05_5 =
- _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
- const __m128i out_21_4 =
- _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
- const __m128i out_21_5 =
- _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
- const __m128i out_13_4 =
- _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
- const __m128i out_13_5 =
- _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
- const __m128i out_29_4 =
- _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
- const __m128i out_29_5 =
- _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
- const __m128i out_03_4 =
- _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
- const __m128i out_03_5 =
- _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
- const __m128i out_19_4 =
- _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
- const __m128i out_19_5 =
- _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
- const __m128i out_11_4 =
- _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
- const __m128i out_11_5 =
- _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
- const __m128i out_27_4 =
- _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
- const __m128i out_27_5 =
- _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
- const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
- const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
- const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
- const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
- const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
- const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
- const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
- const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
- const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
- const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
- const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
- const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
- const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
- const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
- const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
- const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
- // Combine
- out[5] = _mm_packs_epi32(out_05_6, out_05_7);
- out[21] = _mm_packs_epi32(out_21_6, out_21_7);
- out[13] = _mm_packs_epi32(out_13_6, out_13_7);
- out[29] = _mm_packs_epi32(out_29_6, out_29_7);
- out[3] = _mm_packs_epi32(out_03_6, out_03_7);
- out[19] = _mm_packs_epi32(out_19_6, out_19_7);
- out[11] = _mm_packs_epi32(out_11_6, out_11_7);
- out[27] = _mm_packs_epi32(out_27_6, out_27_7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
- &out[3], &out[19], &out[11], &out[27]);
- if (overflow) {
- if (pass == 0)
- HIGH_FDCT32x32_2D_C(input, output_org, stride);
- else
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
-#if FDCT32x32_HIGH_PRECISION
- } else {
- __m128i lstep1[64], lstep2[64], lstep3[64];
- __m128i u[32], v[32], sign[16];
- const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
- // start using 32-bit operations
- // stage 3
- {
- // expanding to 32-bit length prior to addition operations
- lstep2[0] = _mm_unpacklo_epi16(step2[0], kZero);
- lstep2[1] = _mm_unpackhi_epi16(step2[0], kZero);
- lstep2[2] = _mm_unpacklo_epi16(step2[1], kZero);
- lstep2[3] = _mm_unpackhi_epi16(step2[1], kZero);
- lstep2[4] = _mm_unpacklo_epi16(step2[2], kZero);
- lstep2[5] = _mm_unpackhi_epi16(step2[2], kZero);
- lstep2[6] = _mm_unpacklo_epi16(step2[3], kZero);
- lstep2[7] = _mm_unpackhi_epi16(step2[3], kZero);
- lstep2[8] = _mm_unpacklo_epi16(step2[4], kZero);
- lstep2[9] = _mm_unpackhi_epi16(step2[4], kZero);
- lstep2[10] = _mm_unpacklo_epi16(step2[5], kZero);
- lstep2[11] = _mm_unpackhi_epi16(step2[5], kZero);
- lstep2[12] = _mm_unpacklo_epi16(step2[6], kZero);
- lstep2[13] = _mm_unpackhi_epi16(step2[6], kZero);
- lstep2[14] = _mm_unpacklo_epi16(step2[7], kZero);
- lstep2[15] = _mm_unpackhi_epi16(step2[7], kZero);
- lstep2[0] = _mm_madd_epi16(lstep2[0], kOne);
- lstep2[1] = _mm_madd_epi16(lstep2[1], kOne);
- lstep2[2] = _mm_madd_epi16(lstep2[2], kOne);
- lstep2[3] = _mm_madd_epi16(lstep2[3], kOne);
- lstep2[4] = _mm_madd_epi16(lstep2[4], kOne);
- lstep2[5] = _mm_madd_epi16(lstep2[5], kOne);
- lstep2[6] = _mm_madd_epi16(lstep2[6], kOne);
- lstep2[7] = _mm_madd_epi16(lstep2[7], kOne);
- lstep2[8] = _mm_madd_epi16(lstep2[8], kOne);
- lstep2[9] = _mm_madd_epi16(lstep2[9], kOne);
- lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
- lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
- lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
- lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
- lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
- lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
-
- lstep3[0] = _mm_add_epi32(lstep2[14], lstep2[0]);
- lstep3[1] = _mm_add_epi32(lstep2[15], lstep2[1]);
- lstep3[2] = _mm_add_epi32(lstep2[12], lstep2[2]);
- lstep3[3] = _mm_add_epi32(lstep2[13], lstep2[3]);
- lstep3[4] = _mm_add_epi32(lstep2[10], lstep2[4]);
- lstep3[5] = _mm_add_epi32(lstep2[11], lstep2[5]);
- lstep3[6] = _mm_add_epi32(lstep2[8], lstep2[6]);
- lstep3[7] = _mm_add_epi32(lstep2[9], lstep2[7]);
- lstep3[8] = _mm_sub_epi32(lstep2[6], lstep2[8]);
- lstep3[9] = _mm_sub_epi32(lstep2[7], lstep2[9]);
- lstep3[10] = _mm_sub_epi32(lstep2[4], lstep2[10]);
- lstep3[11] = _mm_sub_epi32(lstep2[5], lstep2[11]);
- lstep3[12] = _mm_sub_epi32(lstep2[2], lstep2[12]);
- lstep3[13] = _mm_sub_epi32(lstep2[3], lstep2[13]);
- lstep3[14] = _mm_sub_epi32(lstep2[0], lstep2[14]);
- lstep3[15] = _mm_sub_epi32(lstep2[1], lstep2[15]);
- }
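// The unpack-with-kZero followed by _mm_madd_epi16 with kOne is an SSE2
// substitute for the SSE4.1 _mm_cvtepi16_epi32 sign extension: the unpack
// zero-extends each word into a 32-bit lane, and the madd computes w*1 + 0*1,
// which restores the sign. A hypothetical helper showing the idiom in
// isolation (assuming zero is _mm_setzero_si128() and one is a vector of
// 16-bit ones):
static __m128i widen_lo_epi16(__m128i v, __m128i zero, __m128i one) {
  const __m128i lo = _mm_unpacklo_epi16(v, zero);  // high halves are zero
  return _mm_madd_epi16(lo, one);                  // w * 1 + 0 * 1, signed
}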
- {
- const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
- const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
- const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
- const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
- const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
- const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
- const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
- const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
- const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
- const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
- const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
- const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
- const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
- const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
- lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
- lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
- lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
- lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
- lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
- lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
- lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
- lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
- }
- {
- lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
- lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
- lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
- lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
- lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
- lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
- lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
- lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
- lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
- lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
- lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
- lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
- lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
- lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
- lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
- lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
- lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
- lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
- lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
- lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
- lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
- lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
- lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
- lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
- lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
- lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
- lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
- lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
- lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
- lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
- lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
- lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
-
- lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
- lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
- lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
- lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
- lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
- lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
- lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
- lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
- lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
- lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
- lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
- lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
- lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
- lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
- lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
- lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
- lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
- lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
- lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
- lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
- lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
- lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
- lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
- lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
- lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
- lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
- lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
- lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
- lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
- lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
- lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
- lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
-
- lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
- lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
-
- lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
- lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
- lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
- lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
- lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
- lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
- lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
- lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
- lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
- lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
- lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
- lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
- lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
- lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
- lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
- lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
- lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
- lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
- lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
- lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
- lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
- lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
- lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
- lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
- lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
- lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
- lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
- lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
- lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
- lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
- }
-
- // stage 4
- {
- // expanding to 32-bit length prior to addition operations
- lstep2[16] = _mm_unpacklo_epi16(step2[8], kZero);
- lstep2[17] = _mm_unpackhi_epi16(step2[8], kZero);
- lstep2[18] = _mm_unpacklo_epi16(step2[9], kZero);
- lstep2[19] = _mm_unpackhi_epi16(step2[9], kZero);
- lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
- lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
- lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
- lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
- lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
- lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
- lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
- lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
- lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
- lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
- lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
- lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
-
- lstep1[0] = _mm_add_epi32(lstep3[6], lstep3[0]);
- lstep1[1] = _mm_add_epi32(lstep3[7], lstep3[1]);
- lstep1[2] = _mm_add_epi32(lstep3[4], lstep3[2]);
- lstep1[3] = _mm_add_epi32(lstep3[5], lstep3[3]);
- lstep1[4] = _mm_sub_epi32(lstep3[2], lstep3[4]);
- lstep1[5] = _mm_sub_epi32(lstep3[3], lstep3[5]);
- lstep1[6] = _mm_sub_epi32(lstep3[0], lstep3[6]);
- lstep1[7] = _mm_sub_epi32(lstep3[1], lstep3[7]);
- lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
- lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
- lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
- lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
- lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
- lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
- lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
- lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
- lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
- lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
- lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
- lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
- lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
- lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
- lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
- lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
- }
- {
- // to be continued...
- //
- const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
- const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
-
- u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
- u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
- u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
- u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
-
- // TODO(jingning): manually inline k_madd_epi32_ to further hide
- // instruction latency.
- v[0] = k_madd_epi32(u[0], k32_p16_m16);
- v[1] = k_madd_epi32(u[1], k32_p16_m16);
- v[2] = k_madd_epi32(u[2], k32_p16_m16);
- v[3] = k_madd_epi32(u[3], k32_p16_m16);
- v[4] = k_madd_epi32(u[0], k32_p16_p16);
- v[5] = k_madd_epi32(u[1], k32_p16_p16);
- v[6] = k_madd_epi32(u[2], k32_p16_p16);
- v[7] = k_madd_epi32(u[3], k32_p16_p16);
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3], &v[4],
- &v[5], &v[6], &v[7], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-
- lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- }
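// In the FDCT32x32_HIGH_PRECISION branch the same butterflies run on 32-bit
// lanes: coefficient pairs are interleaved with _mm_unpack*_epi32, multiplied
// against pair_set_epi32 constants with 64-bit accumulation (k_madd_epi32),
// repacked to 32 bits (k_packs_epi64), and then rounded as before. The net
// per-output arithmetic is presumably the widened form of the 16-bit path:
static int32_t butterfly32_round_shift(int32_t x, int32_t y, int32_t c0,
                                       int32_t c1) {
  const int64_t sum = (int64_t)x * c0 + (int64_t)y * c1;
  return (int32_t)((sum + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}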
- {
- const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
- const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
- const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-
- u[0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
- u[1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
- u[2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
- u[3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
- u[4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
- u[5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
- u[6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
- u[7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
- u[8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
- u[9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
- u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
- u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
- u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
- u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
- u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
- u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
-
- v[0] = k_madd_epi32(u[0], k32_m08_p24);
- v[1] = k_madd_epi32(u[1], k32_m08_p24);
- v[2] = k_madd_epi32(u[2], k32_m08_p24);
- v[3] = k_madd_epi32(u[3], k32_m08_p24);
- v[4] = k_madd_epi32(u[4], k32_m08_p24);
- v[5] = k_madd_epi32(u[5], k32_m08_p24);
- v[6] = k_madd_epi32(u[6], k32_m08_p24);
- v[7] = k_madd_epi32(u[7], k32_m08_p24);
- v[8] = k_madd_epi32(u[8], k32_m24_m08);
- v[9] = k_madd_epi32(u[9], k32_m24_m08);
- v[10] = k_madd_epi32(u[10], k32_m24_m08);
- v[11] = k_madd_epi32(u[11], k32_m24_m08);
- v[12] = k_madd_epi32(u[12], k32_m24_m08);
- v[13] = k_madd_epi32(u[13], k32_m24_m08);
- v[14] = k_madd_epi32(u[14], k32_m24_m08);
- v[15] = k_madd_epi32(u[15], k32_m24_m08);
- v[16] = k_madd_epi32(u[12], k32_m08_p24);
- v[17] = k_madd_epi32(u[13], k32_m08_p24);
- v[18] = k_madd_epi32(u[14], k32_m08_p24);
- v[19] = k_madd_epi32(u[15], k32_m08_p24);
- v[20] = k_madd_epi32(u[8], k32_m08_p24);
- v[21] = k_madd_epi32(u[9], k32_m08_p24);
- v[22] = k_madd_epi32(u[10], k32_m08_p24);
- v[23] = k_madd_epi32(u[11], k32_m08_p24);
- v[24] = k_madd_epi32(u[4], k32_p24_p08);
- v[25] = k_madd_epi32(u[5], k32_p24_p08);
- v[26] = k_madd_epi32(u[6], k32_p24_p08);
- v[27] = k_madd_epi32(u[7], k32_p24_p08);
- v[28] = k_madd_epi32(u[0], k32_p24_p08);
- v[29] = k_madd_epi32(u[1], k32_p24_p08);
- v[30] = k_madd_epi32(u[2], k32_p24_p08);
- v[31] = k_madd_epi32(u[3], k32_p24_p08);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
- &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
- &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
- u[8] = k_packs_epi64(v[16], v[17]);
- u[9] = k_packs_epi64(v[18], v[19]);
- u[10] = k_packs_epi64(v[20], v[21]);
- u[11] = k_packs_epi64(v[22], v[23]);
- u[12] = k_packs_epi64(v[24], v[25]);
- u[13] = k_packs_epi64(v[26], v[27]);
- u[14] = k_packs_epi64(v[28], v[29]);
- u[15] = k_packs_epi64(v[30], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- lstep1[36] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- lstep1[37] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- lstep1[38] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- lstep1[39] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- lstep1[40] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- lstep1[41] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- lstep1[42] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- lstep1[43] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- lstep1[52] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- lstep1[53] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
- }
- // stage 5
- {
- lstep2[8] = _mm_add_epi32(lstep1[10], lstep3[8]);
- lstep2[9] = _mm_add_epi32(lstep1[11], lstep3[9]);
- lstep2[10] = _mm_sub_epi32(lstep3[8], lstep1[10]);
- lstep2[11] = _mm_sub_epi32(lstep3[9], lstep1[11]);
- lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
- lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
- lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
- lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
- }
- {
- const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
- const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
- const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
- const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
-
- u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
- u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
- u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
- u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
- u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
- u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
- u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
- u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
-
- // TODO(jingning): manually inline k_madd_epi32_ to further hide
- // instruction latency.
- v[0] = k_madd_epi32(u[0], k32_p16_p16);
- v[1] = k_madd_epi32(u[1], k32_p16_p16);
- v[2] = k_madd_epi32(u[2], k32_p16_p16);
- v[3] = k_madd_epi32(u[3], k32_p16_p16);
- v[4] = k_madd_epi32(u[0], k32_p16_m16);
- v[5] = k_madd_epi32(u[1], k32_p16_m16);
- v[6] = k_madd_epi32(u[2], k32_p16_m16);
- v[7] = k_madd_epi32(u[3], k32_p16_m16);
- v[8] = k_madd_epi32(u[4], k32_p24_p08);
- v[9] = k_madd_epi32(u[5], k32_p24_p08);
- v[10] = k_madd_epi32(u[6], k32_p24_p08);
- v[11] = k_madd_epi32(u[7], k32_p24_p08);
- v[12] = k_madd_epi32(u[4], k32_m08_p24);
- v[13] = k_madd_epi32(u[5], k32_m08_p24);
- v[14] = k_madd_epi32(u[6], k32_m08_p24);
- v[15] = k_madd_epi32(u[7], k32_m08_p24);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_16(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
- sign[0] = _mm_cmplt_epi32(u[0], kZero);
- sign[1] = _mm_cmplt_epi32(u[1], kZero);
- sign[2] = _mm_cmplt_epi32(u[2], kZero);
- sign[3] = _mm_cmplt_epi32(u[3], kZero);
- sign[4] = _mm_cmplt_epi32(u[4], kZero);
- sign[5] = _mm_cmplt_epi32(u[5], kZero);
- sign[6] = _mm_cmplt_epi32(u[6], kZero);
- sign[7] = _mm_cmplt_epi32(u[7], kZero);
-
- u[0] = _mm_sub_epi32(u[0], sign[0]);
- u[1] = _mm_sub_epi32(u[1], sign[1]);
- u[2] = _mm_sub_epi32(u[2], sign[2]);
- u[3] = _mm_sub_epi32(u[3], sign[3]);
- u[4] = _mm_sub_epi32(u[4], sign[4]);
- u[5] = _mm_sub_epi32(u[5], sign[5]);
- u[6] = _mm_sub_epi32(u[6], sign[6]);
- u[7] = _mm_sub_epi32(u[7], sign[7]);
-
- u[0] = _mm_add_epi32(u[0], K32One);
- u[1] = _mm_add_epi32(u[1], K32One);
- u[2] = _mm_add_epi32(u[2], K32One);
- u[3] = _mm_add_epi32(u[3], K32One);
- u[4] = _mm_add_epi32(u[4], K32One);
- u[5] = _mm_add_epi32(u[5], K32One);
- u[6] = _mm_add_epi32(u[6], K32One);
- u[7] = _mm_add_epi32(u[7], K32One);
-
- u[0] = _mm_srai_epi32(u[0], 2);
- u[1] = _mm_srai_epi32(u[1], 2);
- u[2] = _mm_srai_epi32(u[2], 2);
- u[3] = _mm_srai_epi32(u[3], 2);
- u[4] = _mm_srai_epi32(u[4], 2);
- u[5] = _mm_srai_epi32(u[5], 2);
- u[6] = _mm_srai_epi32(u[6], 2);
- u[7] = _mm_srai_epi32(u[7], 2);
-
- // Combine
- out[0] = _mm_packs_epi32(u[0], u[1]);
- out[16] = _mm_packs_epi32(u[2], u[3]);
- out[8] = _mm_packs_epi32(u[4], u[5]);
- out[24] = _mm_packs_epi32(u[6], u[7]);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&out[0], &out[16], &out[8], &out[24]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
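// The cmplt/sub/add/srai sequence above implements the second-pass output
// rounding: divide by 4, rounding to nearest with ties broken toward zero,
// i.e. (x + 1 + (x < 0)) >> 2 per lane, which appears to mirror the rounding
// used by the plain-C fdct32 path. Scalar sketch:
static int32_t final_round_shift2(int32_t x) {
  return (x + 1 + (x < 0)) >> 2;
}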
- {
- const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
- const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
- const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
-
- u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
- u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
- u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
- u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
- u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
- u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
- u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
- u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
-
- v[0] = k_madd_epi32(u[0], k32_m08_p24);
- v[1] = k_madd_epi32(u[1], k32_m08_p24);
- v[2] = k_madd_epi32(u[2], k32_m08_p24);
- v[3] = k_madd_epi32(u[3], k32_m08_p24);
- v[4] = k_madd_epi32(u[4], k32_m24_m08);
- v[5] = k_madd_epi32(u[5], k32_m24_m08);
- v[6] = k_madd_epi32(u[6], k32_m24_m08);
- v[7] = k_madd_epi32(u[7], k32_m24_m08);
- v[8] = k_madd_epi32(u[4], k32_m08_p24);
- v[9] = k_madd_epi32(u[5], k32_m08_p24);
- v[10] = k_madd_epi32(u[6], k32_m08_p24);
- v[11] = k_madd_epi32(u[7], k32_m08_p24);
- v[12] = k_madd_epi32(u[0], k32_p24_p08);
- v[13] = k_madd_epi32(u[1], k32_p24_p08);
- v[14] = k_madd_epi32(u[2], k32_p24_p08);
- v[15] = k_madd_epi32(u[3], k32_p24_p08);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_16(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
-
- u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
- lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- }
- {
- lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
- lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
- lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
- lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
- lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
- lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
- lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
- lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
- lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
- lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
- lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
- lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
- lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
- lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
- lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
- lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
- lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
- lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
- lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
- lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
- lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
- lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
- lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
- lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
- lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
- lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
- lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
- lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
- lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
- lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
- lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
- lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
- }
- // stage 6
- {
- const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
- const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
- const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
- const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
-
- u[0] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
- u[1] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
- u[2] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
- u[3] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
- u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
- u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
- u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
- u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
- u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
- u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
- u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
- u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
- u[12] = _mm_unpacklo_epi32(lstep2[8], lstep2[14]);
- u[13] = _mm_unpackhi_epi32(lstep2[8], lstep2[14]);
- u[14] = _mm_unpacklo_epi32(lstep2[9], lstep2[15]);
- u[15] = _mm_unpackhi_epi32(lstep2[9], lstep2[15]);
-
- v[0] = k_madd_epi32(u[0], k32_p28_p04);
- v[1] = k_madd_epi32(u[1], k32_p28_p04);
- v[2] = k_madd_epi32(u[2], k32_p28_p04);
- v[3] = k_madd_epi32(u[3], k32_p28_p04);
- v[4] = k_madd_epi32(u[4], k32_p12_p20);
- v[5] = k_madd_epi32(u[5], k32_p12_p20);
- v[6] = k_madd_epi32(u[6], k32_p12_p20);
- v[7] = k_madd_epi32(u[7], k32_p12_p20);
- v[8] = k_madd_epi32(u[8], k32_m20_p12);
- v[9] = k_madd_epi32(u[9], k32_m20_p12);
- v[10] = k_madd_epi32(u[10], k32_m20_p12);
- v[11] = k_madd_epi32(u[11], k32_m20_p12);
- v[12] = k_madd_epi32(u[12], k32_m04_p28);
- v[13] = k_madd_epi32(u[13], k32_m04_p28);
- v[14] = k_madd_epi32(u[14], k32_m04_p28);
- v[15] = k_madd_epi32(u[15], k32_m04_p28);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_16(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
- sign[0] = _mm_cmplt_epi32(u[0], kZero);
- sign[1] = _mm_cmplt_epi32(u[1], kZero);
- sign[2] = _mm_cmplt_epi32(u[2], kZero);
- sign[3] = _mm_cmplt_epi32(u[3], kZero);
- sign[4] = _mm_cmplt_epi32(u[4], kZero);
- sign[5] = _mm_cmplt_epi32(u[5], kZero);
- sign[6] = _mm_cmplt_epi32(u[6], kZero);
- sign[7] = _mm_cmplt_epi32(u[7], kZero);
-
- u[0] = _mm_sub_epi32(u[0], sign[0]);
- u[1] = _mm_sub_epi32(u[1], sign[1]);
- u[2] = _mm_sub_epi32(u[2], sign[2]);
- u[3] = _mm_sub_epi32(u[3], sign[3]);
- u[4] = _mm_sub_epi32(u[4], sign[4]);
- u[5] = _mm_sub_epi32(u[5], sign[5]);
- u[6] = _mm_sub_epi32(u[6], sign[6]);
- u[7] = _mm_sub_epi32(u[7], sign[7]);
-
- u[0] = _mm_add_epi32(u[0], K32One);
- u[1] = _mm_add_epi32(u[1], K32One);
- u[2] = _mm_add_epi32(u[2], K32One);
- u[3] = _mm_add_epi32(u[3], K32One);
- u[4] = _mm_add_epi32(u[4], K32One);
- u[5] = _mm_add_epi32(u[5], K32One);
- u[6] = _mm_add_epi32(u[6], K32One);
- u[7] = _mm_add_epi32(u[7], K32One);
-
- u[0] = _mm_srai_epi32(u[0], 2);
- u[1] = _mm_srai_epi32(u[1], 2);
- u[2] = _mm_srai_epi32(u[2], 2);
- u[3] = _mm_srai_epi32(u[3], 2);
- u[4] = _mm_srai_epi32(u[4], 2);
- u[5] = _mm_srai_epi32(u[5], 2);
- u[6] = _mm_srai_epi32(u[6], 2);
- u[7] = _mm_srai_epi32(u[7], 2);
-
- out[4] = _mm_packs_epi32(u[0], u[1]);
- out[20] = _mm_packs_epi32(u[2], u[3]);
- out[12] = _mm_packs_epi32(u[4], u[5]);
- out[28] = _mm_packs_epi32(u[6], u[7]);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&out[4], &out[20], &out[12], &out[28]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
- lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
- lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
- lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
- lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
- lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
- lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
- lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
- lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
- lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
- lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
- lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
- lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
- lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
- lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
- lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
- }
- {
- const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
- const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
- const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
- const __m128i k32_m12_m20 =
- pair_set_epi32(-cospi_12_64, -cospi_20_64);
- const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
- const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
-
- u[0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
- u[1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
- u[2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
- u[3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
- u[4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
- u[5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
- u[6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
- u[7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
- u[8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
- u[9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
- u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
- u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
- u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
- u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
- u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
- u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
-
- v[0] = k_madd_epi32(u[0], k32_m04_p28);
- v[1] = k_madd_epi32(u[1], k32_m04_p28);
- v[2] = k_madd_epi32(u[2], k32_m04_p28);
- v[3] = k_madd_epi32(u[3], k32_m04_p28);
- v[4] = k_madd_epi32(u[4], k32_m28_m04);
- v[5] = k_madd_epi32(u[5], k32_m28_m04);
- v[6] = k_madd_epi32(u[6], k32_m28_m04);
- v[7] = k_madd_epi32(u[7], k32_m28_m04);
- v[8] = k_madd_epi32(u[8], k32_m20_p12);
- v[9] = k_madd_epi32(u[9], k32_m20_p12);
- v[10] = k_madd_epi32(u[10], k32_m20_p12);
- v[11] = k_madd_epi32(u[11], k32_m20_p12);
- v[12] = k_madd_epi32(u[12], k32_m12_m20);
- v[13] = k_madd_epi32(u[13], k32_m12_m20);
- v[14] = k_madd_epi32(u[14], k32_m12_m20);
- v[15] = k_madd_epi32(u[15], k32_m12_m20);
- v[16] = k_madd_epi32(u[12], k32_m20_p12);
- v[17] = k_madd_epi32(u[13], k32_m20_p12);
- v[18] = k_madd_epi32(u[14], k32_m20_p12);
- v[19] = k_madd_epi32(u[15], k32_m20_p12);
- v[20] = k_madd_epi32(u[8], k32_p12_p20);
- v[21] = k_madd_epi32(u[9], k32_p12_p20);
- v[22] = k_madd_epi32(u[10], k32_p12_p20);
- v[23] = k_madd_epi32(u[11], k32_p12_p20);
- v[24] = k_madd_epi32(u[4], k32_m04_p28);
- v[25] = k_madd_epi32(u[5], k32_m04_p28);
- v[26] = k_madd_epi32(u[6], k32_m04_p28);
- v[27] = k_madd_epi32(u[7], k32_m04_p28);
- v[28] = k_madd_epi32(u[0], k32_p28_p04);
- v[29] = k_madd_epi32(u[1], k32_p28_p04);
- v[30] = k_madd_epi32(u[2], k32_p28_p04);
- v[31] = k_madd_epi32(u[3], k32_p28_p04);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
- &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
- &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
- u[8] = k_packs_epi64(v[16], v[17]);
- u[9] = k_packs_epi64(v[18], v[19]);
- u[10] = k_packs_epi64(v[20], v[21]);
- u[11] = k_packs_epi64(v[22], v[23]);
- u[12] = k_packs_epi64(v[24], v[25]);
- u[13] = k_packs_epi64(v[26], v[27]);
- u[14] = k_packs_epi64(v[28], v[29]);
- u[15] = k_packs_epi64(v[30], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- lstep3[34] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- lstep3[35] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- lstep3[36] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- lstep3[37] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- lstep3[42] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- lstep3[43] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- lstep3[44] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- lstep3[45] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- lstep3[50] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- lstep3[51] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
- }
- // stage 7
- {
- const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
- const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
- const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
- const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
- const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
- const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
- const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
- const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
-
- u[0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
- u[1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
- u[2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
- u[3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
- u[4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
- u[5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
- u[6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
- u[7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
- u[8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
- u[9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
- u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
- u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
- u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
- u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
- u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
- u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
-
- v[0] = k_madd_epi32(u[0], k32_p30_p02);
- v[1] = k_madd_epi32(u[1], k32_p30_p02);
- v[2] = k_madd_epi32(u[2], k32_p30_p02);
- v[3] = k_madd_epi32(u[3], k32_p30_p02);
- v[4] = k_madd_epi32(u[4], k32_p14_p18);
- v[5] = k_madd_epi32(u[5], k32_p14_p18);
- v[6] = k_madd_epi32(u[6], k32_p14_p18);
- v[7] = k_madd_epi32(u[7], k32_p14_p18);
- v[8] = k_madd_epi32(u[8], k32_p22_p10);
- v[9] = k_madd_epi32(u[9], k32_p22_p10);
- v[10] = k_madd_epi32(u[10], k32_p22_p10);
- v[11] = k_madd_epi32(u[11], k32_p22_p10);
- v[12] = k_madd_epi32(u[12], k32_p06_p26);
- v[13] = k_madd_epi32(u[13], k32_p06_p26);
- v[14] = k_madd_epi32(u[14], k32_p06_p26);
- v[15] = k_madd_epi32(u[15], k32_p06_p26);
- v[16] = k_madd_epi32(u[12], k32_m26_p06);
- v[17] = k_madd_epi32(u[13], k32_m26_p06);
- v[18] = k_madd_epi32(u[14], k32_m26_p06);
- v[19] = k_madd_epi32(u[15], k32_m26_p06);
- v[20] = k_madd_epi32(u[8], k32_m10_p22);
- v[21] = k_madd_epi32(u[9], k32_m10_p22);
- v[22] = k_madd_epi32(u[10], k32_m10_p22);
- v[23] = k_madd_epi32(u[11], k32_m10_p22);
- v[24] = k_madd_epi32(u[4], k32_m18_p14);
- v[25] = k_madd_epi32(u[5], k32_m18_p14);
- v[26] = k_madd_epi32(u[6], k32_m18_p14);
- v[27] = k_madd_epi32(u[7], k32_m18_p14);
- v[28] = k_madd_epi32(u[0], k32_m02_p30);
- v[29] = k_madd_epi32(u[1], k32_m02_p30);
- v[30] = k_madd_epi32(u[2], k32_m02_p30);
- v[31] = k_madd_epi32(u[3], k32_m02_p30);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
- &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
- &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
- u[8] = k_packs_epi64(v[16], v[17]);
- u[9] = k_packs_epi64(v[18], v[19]);
- u[10] = k_packs_epi64(v[20], v[21]);
- u[11] = k_packs_epi64(v[22], v[23]);
- u[12] = k_packs_epi64(v[24], v[25]);
- u[13] = k_packs_epi64(v[26], v[27]);
- u[14] = k_packs_epi64(v[28], v[29]);
- u[15] = k_packs_epi64(v[30], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- v[0] = _mm_cmplt_epi32(u[0], kZero);
- v[1] = _mm_cmplt_epi32(u[1], kZero);
- v[2] = _mm_cmplt_epi32(u[2], kZero);
- v[3] = _mm_cmplt_epi32(u[3], kZero);
- v[4] = _mm_cmplt_epi32(u[4], kZero);
- v[5] = _mm_cmplt_epi32(u[5], kZero);
- v[6] = _mm_cmplt_epi32(u[6], kZero);
- v[7] = _mm_cmplt_epi32(u[7], kZero);
- v[8] = _mm_cmplt_epi32(u[8], kZero);
- v[9] = _mm_cmplt_epi32(u[9], kZero);
- v[10] = _mm_cmplt_epi32(u[10], kZero);
- v[11] = _mm_cmplt_epi32(u[11], kZero);
- v[12] = _mm_cmplt_epi32(u[12], kZero);
- v[13] = _mm_cmplt_epi32(u[13], kZero);
- v[14] = _mm_cmplt_epi32(u[14], kZero);
- v[15] = _mm_cmplt_epi32(u[15], kZero);
-
- u[0] = _mm_sub_epi32(u[0], v[0]);
- u[1] = _mm_sub_epi32(u[1], v[1]);
- u[2] = _mm_sub_epi32(u[2], v[2]);
- u[3] = _mm_sub_epi32(u[3], v[3]);
- u[4] = _mm_sub_epi32(u[4], v[4]);
- u[5] = _mm_sub_epi32(u[5], v[5]);
- u[6] = _mm_sub_epi32(u[6], v[6]);
- u[7] = _mm_sub_epi32(u[7], v[7]);
- u[8] = _mm_sub_epi32(u[8], v[8]);
- u[9] = _mm_sub_epi32(u[9], v[9]);
- u[10] = _mm_sub_epi32(u[10], v[10]);
- u[11] = _mm_sub_epi32(u[11], v[11]);
- u[12] = _mm_sub_epi32(u[12], v[12]);
- u[13] = _mm_sub_epi32(u[13], v[13]);
- u[14] = _mm_sub_epi32(u[14], v[14]);
- u[15] = _mm_sub_epi32(u[15], v[15]);
-
- v[0] = _mm_add_epi32(u[0], K32One);
- v[1] = _mm_add_epi32(u[1], K32One);
- v[2] = _mm_add_epi32(u[2], K32One);
- v[3] = _mm_add_epi32(u[3], K32One);
- v[4] = _mm_add_epi32(u[4], K32One);
- v[5] = _mm_add_epi32(u[5], K32One);
- v[6] = _mm_add_epi32(u[6], K32One);
- v[7] = _mm_add_epi32(u[7], K32One);
- v[8] = _mm_add_epi32(u[8], K32One);
- v[9] = _mm_add_epi32(u[9], K32One);
- v[10] = _mm_add_epi32(u[10], K32One);
- v[11] = _mm_add_epi32(u[11], K32One);
- v[12] = _mm_add_epi32(u[12], K32One);
- v[13] = _mm_add_epi32(u[13], K32One);
- v[14] = _mm_add_epi32(u[14], K32One);
- v[15] = _mm_add_epi32(u[15], K32One);
-
- u[0] = _mm_srai_epi32(v[0], 2);
- u[1] = _mm_srai_epi32(v[1], 2);
- u[2] = _mm_srai_epi32(v[2], 2);
- u[3] = _mm_srai_epi32(v[3], 2);
- u[4] = _mm_srai_epi32(v[4], 2);
- u[5] = _mm_srai_epi32(v[5], 2);
- u[6] = _mm_srai_epi32(v[6], 2);
- u[7] = _mm_srai_epi32(v[7], 2);
- u[8] = _mm_srai_epi32(v[8], 2);
- u[9] = _mm_srai_epi32(v[9], 2);
- u[10] = _mm_srai_epi32(v[10], 2);
- u[11] = _mm_srai_epi32(v[11], 2);
- u[12] = _mm_srai_epi32(v[12], 2);
- u[13] = _mm_srai_epi32(v[13], 2);
- u[14] = _mm_srai_epi32(v[14], 2);
- u[15] = _mm_srai_epi32(v[15], 2);
-
- out[2] = _mm_packs_epi32(u[0], u[1]);
- out[18] = _mm_packs_epi32(u[2], u[3]);
- out[10] = _mm_packs_epi32(u[4], u[5]);
- out[26] = _mm_packs_epi32(u[6], u[7]);
- out[6] = _mm_packs_epi32(u[8], u[9]);
- out[22] = _mm_packs_epi32(u[10], u[11]);
- out[14] = _mm_packs_epi32(u[12], u[13]);
- out[30] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[2], &out[18], &out[10], &out[26],
- &out[6], &out[22], &out[14], &out[30]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
- lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
- lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
- lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
- lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
- lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
- lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
- lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
- lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
- lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
- lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
- lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
- lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
- lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
- lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
- lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
- lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
- lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
- lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
- lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
- lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
- lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
- lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
- lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
- lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
- lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
- lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
- lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
- lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
- lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
- lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
- lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
- }
- // stage 8
- {
- const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
- const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
- const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
- const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
- const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
- const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
- const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
- const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
-
- u[0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
- u[1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
- u[2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
- u[3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
- u[4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
- u[5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
- u[6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
- u[7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
- u[8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
- u[9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
- u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
- u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
- u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
- u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
- u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
- u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
-
- v[0] = k_madd_epi32(u[0], k32_p31_p01);
- v[1] = k_madd_epi32(u[1], k32_p31_p01);
- v[2] = k_madd_epi32(u[2], k32_p31_p01);
- v[3] = k_madd_epi32(u[3], k32_p31_p01);
- v[4] = k_madd_epi32(u[4], k32_p15_p17);
- v[5] = k_madd_epi32(u[5], k32_p15_p17);
- v[6] = k_madd_epi32(u[6], k32_p15_p17);
- v[7] = k_madd_epi32(u[7], k32_p15_p17);
- v[8] = k_madd_epi32(u[8], k32_p23_p09);
- v[9] = k_madd_epi32(u[9], k32_p23_p09);
- v[10] = k_madd_epi32(u[10], k32_p23_p09);
- v[11] = k_madd_epi32(u[11], k32_p23_p09);
- v[12] = k_madd_epi32(u[12], k32_p07_p25);
- v[13] = k_madd_epi32(u[13], k32_p07_p25);
- v[14] = k_madd_epi32(u[14], k32_p07_p25);
- v[15] = k_madd_epi32(u[15], k32_p07_p25);
- v[16] = k_madd_epi32(u[12], k32_m25_p07);
- v[17] = k_madd_epi32(u[13], k32_m25_p07);
- v[18] = k_madd_epi32(u[14], k32_m25_p07);
- v[19] = k_madd_epi32(u[15], k32_m25_p07);
- v[20] = k_madd_epi32(u[8], k32_m09_p23);
- v[21] = k_madd_epi32(u[9], k32_m09_p23);
- v[22] = k_madd_epi32(u[10], k32_m09_p23);
- v[23] = k_madd_epi32(u[11], k32_m09_p23);
- v[24] = k_madd_epi32(u[4], k32_m17_p15);
- v[25] = k_madd_epi32(u[5], k32_m17_p15);
- v[26] = k_madd_epi32(u[6], k32_m17_p15);
- v[27] = k_madd_epi32(u[7], k32_m17_p15);
- v[28] = k_madd_epi32(u[0], k32_m01_p31);
- v[29] = k_madd_epi32(u[1], k32_m01_p31);
- v[30] = k_madd_epi32(u[2], k32_m01_p31);
- v[31] = k_madd_epi32(u[3], k32_m01_p31);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
- &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
- &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
- u[8] = k_packs_epi64(v[16], v[17]);
- u[9] = k_packs_epi64(v[18], v[19]);
- u[10] = k_packs_epi64(v[20], v[21]);
- u[11] = k_packs_epi64(v[22], v[23]);
- u[12] = k_packs_epi64(v[24], v[25]);
- u[13] = k_packs_epi64(v[26], v[27]);
- u[14] = k_packs_epi64(v[28], v[29]);
- u[15] = k_packs_epi64(v[30], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- v[0] = _mm_cmplt_epi32(u[0], kZero);
- v[1] = _mm_cmplt_epi32(u[1], kZero);
- v[2] = _mm_cmplt_epi32(u[2], kZero);
- v[3] = _mm_cmplt_epi32(u[3], kZero);
- v[4] = _mm_cmplt_epi32(u[4], kZero);
- v[5] = _mm_cmplt_epi32(u[5], kZero);
- v[6] = _mm_cmplt_epi32(u[6], kZero);
- v[7] = _mm_cmplt_epi32(u[7], kZero);
- v[8] = _mm_cmplt_epi32(u[8], kZero);
- v[9] = _mm_cmplt_epi32(u[9], kZero);
- v[10] = _mm_cmplt_epi32(u[10], kZero);
- v[11] = _mm_cmplt_epi32(u[11], kZero);
- v[12] = _mm_cmplt_epi32(u[12], kZero);
- v[13] = _mm_cmplt_epi32(u[13], kZero);
- v[14] = _mm_cmplt_epi32(u[14], kZero);
- v[15] = _mm_cmplt_epi32(u[15], kZero);
-
- u[0] = _mm_sub_epi32(u[0], v[0]);
- u[1] = _mm_sub_epi32(u[1], v[1]);
- u[2] = _mm_sub_epi32(u[2], v[2]);
- u[3] = _mm_sub_epi32(u[3], v[3]);
- u[4] = _mm_sub_epi32(u[4], v[4]);
- u[5] = _mm_sub_epi32(u[5], v[5]);
- u[6] = _mm_sub_epi32(u[6], v[6]);
- u[7] = _mm_sub_epi32(u[7], v[7]);
- u[8] = _mm_sub_epi32(u[8], v[8]);
- u[9] = _mm_sub_epi32(u[9], v[9]);
- u[10] = _mm_sub_epi32(u[10], v[10]);
- u[11] = _mm_sub_epi32(u[11], v[11]);
- u[12] = _mm_sub_epi32(u[12], v[12]);
- u[13] = _mm_sub_epi32(u[13], v[13]);
- u[14] = _mm_sub_epi32(u[14], v[14]);
- u[15] = _mm_sub_epi32(u[15], v[15]);
-
- v[0] = _mm_add_epi32(u[0], K32One);
- v[1] = _mm_add_epi32(u[1], K32One);
- v[2] = _mm_add_epi32(u[2], K32One);
- v[3] = _mm_add_epi32(u[3], K32One);
- v[4] = _mm_add_epi32(u[4], K32One);
- v[5] = _mm_add_epi32(u[5], K32One);
- v[6] = _mm_add_epi32(u[6], K32One);
- v[7] = _mm_add_epi32(u[7], K32One);
- v[8] = _mm_add_epi32(u[8], K32One);
- v[9] = _mm_add_epi32(u[9], K32One);
- v[10] = _mm_add_epi32(u[10], K32One);
- v[11] = _mm_add_epi32(u[11], K32One);
- v[12] = _mm_add_epi32(u[12], K32One);
- v[13] = _mm_add_epi32(u[13], K32One);
- v[14] = _mm_add_epi32(u[14], K32One);
- v[15] = _mm_add_epi32(u[15], K32One);
-
- u[0] = _mm_srai_epi32(v[0], 2);
- u[1] = _mm_srai_epi32(v[1], 2);
- u[2] = _mm_srai_epi32(v[2], 2);
- u[3] = _mm_srai_epi32(v[3], 2);
- u[4] = _mm_srai_epi32(v[4], 2);
- u[5] = _mm_srai_epi32(v[5], 2);
- u[6] = _mm_srai_epi32(v[6], 2);
- u[7] = _mm_srai_epi32(v[7], 2);
- u[8] = _mm_srai_epi32(v[8], 2);
- u[9] = _mm_srai_epi32(v[9], 2);
- u[10] = _mm_srai_epi32(v[10], 2);
- u[11] = _mm_srai_epi32(v[11], 2);
- u[12] = _mm_srai_epi32(v[12], 2);
- u[13] = _mm_srai_epi32(v[13], 2);
- u[14] = _mm_srai_epi32(v[14], 2);
- u[15] = _mm_srai_epi32(v[15], 2);
-
- out[1] = _mm_packs_epi32(u[0], u[1]);
- out[17] = _mm_packs_epi32(u[2], u[3]);
- out[9] = _mm_packs_epi32(u[4], u[5]);
- out[25] = _mm_packs_epi32(u[6], u[7]);
- out[7] = _mm_packs_epi32(u[8], u[9]);
- out[23] = _mm_packs_epi32(u[10], u[11]);
- out[15] = _mm_packs_epi32(u[12], u[13]);
- out[31] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[1], &out[17], &out[9], &out[25],
- &out[7], &out[23], &out[15], &out[31]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
- const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
- const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
- const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
- const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
- const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
- const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
- const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
-
- u[0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
- u[1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
- u[2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
- u[3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
- u[4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
- u[5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
- u[6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
- u[7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
- u[8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
- u[9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
- u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
- u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
- u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
- u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
- u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
- u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
-
- v[0] = k_madd_epi32(u[0], k32_p27_p05);
- v[1] = k_madd_epi32(u[1], k32_p27_p05);
- v[2] = k_madd_epi32(u[2], k32_p27_p05);
- v[3] = k_madd_epi32(u[3], k32_p27_p05);
- v[4] = k_madd_epi32(u[4], k32_p11_p21);
- v[5] = k_madd_epi32(u[5], k32_p11_p21);
- v[6] = k_madd_epi32(u[6], k32_p11_p21);
- v[7] = k_madd_epi32(u[7], k32_p11_p21);
- v[8] = k_madd_epi32(u[8], k32_p19_p13);
- v[9] = k_madd_epi32(u[9], k32_p19_p13);
- v[10] = k_madd_epi32(u[10], k32_p19_p13);
- v[11] = k_madd_epi32(u[11], k32_p19_p13);
- v[12] = k_madd_epi32(u[12], k32_p03_p29);
- v[13] = k_madd_epi32(u[13], k32_p03_p29);
- v[14] = k_madd_epi32(u[14], k32_p03_p29);
- v[15] = k_madd_epi32(u[15], k32_p03_p29);
- v[16] = k_madd_epi32(u[12], k32_m29_p03);
- v[17] = k_madd_epi32(u[13], k32_m29_p03);
- v[18] = k_madd_epi32(u[14], k32_m29_p03);
- v[19] = k_madd_epi32(u[15], k32_m29_p03);
- v[20] = k_madd_epi32(u[8], k32_m13_p19);
- v[21] = k_madd_epi32(u[9], k32_m13_p19);
- v[22] = k_madd_epi32(u[10], k32_m13_p19);
- v[23] = k_madd_epi32(u[11], k32_m13_p19);
- v[24] = k_madd_epi32(u[4], k32_m21_p11);
- v[25] = k_madd_epi32(u[5], k32_m21_p11);
- v[26] = k_madd_epi32(u[6], k32_m21_p11);
- v[27] = k_madd_epi32(u[7], k32_m21_p11);
- v[28] = k_madd_epi32(u[0], k32_m05_p27);
- v[29] = k_madd_epi32(u[1], k32_m05_p27);
- v[30] = k_madd_epi32(u[2], k32_m05_p27);
- v[31] = k_madd_epi32(u[3], k32_m05_p27);
-
-#if DCT_HIGH_BIT_DEPTH
- overflow = k_check_epi32_overflow_32(
- &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7], &v[8],
- &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15], &v[16],
- &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23], &v[24],
- &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31], &kZero);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- u[0] = k_packs_epi64(v[0], v[1]);
- u[1] = k_packs_epi64(v[2], v[3]);
- u[2] = k_packs_epi64(v[4], v[5]);
- u[3] = k_packs_epi64(v[6], v[7]);
- u[4] = k_packs_epi64(v[8], v[9]);
- u[5] = k_packs_epi64(v[10], v[11]);
- u[6] = k_packs_epi64(v[12], v[13]);
- u[7] = k_packs_epi64(v[14], v[15]);
- u[8] = k_packs_epi64(v[16], v[17]);
- u[9] = k_packs_epi64(v[18], v[19]);
- u[10] = k_packs_epi64(v[20], v[21]);
- u[11] = k_packs_epi64(v[22], v[23]);
- u[12] = k_packs_epi64(v[24], v[25]);
- u[13] = k_packs_epi64(v[26], v[27]);
- u[14] = k_packs_epi64(v[28], v[29]);
- u[15] = k_packs_epi64(v[30], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- v[0] = _mm_cmplt_epi32(u[0], kZero);
- v[1] = _mm_cmplt_epi32(u[1], kZero);
- v[2] = _mm_cmplt_epi32(u[2], kZero);
- v[3] = _mm_cmplt_epi32(u[3], kZero);
- v[4] = _mm_cmplt_epi32(u[4], kZero);
- v[5] = _mm_cmplt_epi32(u[5], kZero);
- v[6] = _mm_cmplt_epi32(u[6], kZero);
- v[7] = _mm_cmplt_epi32(u[7], kZero);
- v[8] = _mm_cmplt_epi32(u[8], kZero);
- v[9] = _mm_cmplt_epi32(u[9], kZero);
- v[10] = _mm_cmplt_epi32(u[10], kZero);
- v[11] = _mm_cmplt_epi32(u[11], kZero);
- v[12] = _mm_cmplt_epi32(u[12], kZero);
- v[13] = _mm_cmplt_epi32(u[13], kZero);
- v[14] = _mm_cmplt_epi32(u[14], kZero);
- v[15] = _mm_cmplt_epi32(u[15], kZero);
-
- u[0] = _mm_sub_epi32(u[0], v[0]);
- u[1] = _mm_sub_epi32(u[1], v[1]);
- u[2] = _mm_sub_epi32(u[2], v[2]);
- u[3] = _mm_sub_epi32(u[3], v[3]);
- u[4] = _mm_sub_epi32(u[4], v[4]);
- u[5] = _mm_sub_epi32(u[5], v[5]);
- u[6] = _mm_sub_epi32(u[6], v[6]);
- u[7] = _mm_sub_epi32(u[7], v[7]);
- u[8] = _mm_sub_epi32(u[8], v[8]);
- u[9] = _mm_sub_epi32(u[9], v[9]);
- u[10] = _mm_sub_epi32(u[10], v[10]);
- u[11] = _mm_sub_epi32(u[11], v[11]);
- u[12] = _mm_sub_epi32(u[12], v[12]);
- u[13] = _mm_sub_epi32(u[13], v[13]);
- u[14] = _mm_sub_epi32(u[14], v[14]);
- u[15] = _mm_sub_epi32(u[15], v[15]);
-
- v[0] = _mm_add_epi32(u[0], K32One);
- v[1] = _mm_add_epi32(u[1], K32One);
- v[2] = _mm_add_epi32(u[2], K32One);
- v[3] = _mm_add_epi32(u[3], K32One);
- v[4] = _mm_add_epi32(u[4], K32One);
- v[5] = _mm_add_epi32(u[5], K32One);
- v[6] = _mm_add_epi32(u[6], K32One);
- v[7] = _mm_add_epi32(u[7], K32One);
- v[8] = _mm_add_epi32(u[8], K32One);
- v[9] = _mm_add_epi32(u[9], K32One);
- v[10] = _mm_add_epi32(u[10], K32One);
- v[11] = _mm_add_epi32(u[11], K32One);
- v[12] = _mm_add_epi32(u[12], K32One);
- v[13] = _mm_add_epi32(u[13], K32One);
- v[14] = _mm_add_epi32(u[14], K32One);
- v[15] = _mm_add_epi32(u[15], K32One);
-
- u[0] = _mm_srai_epi32(v[0], 2);
- u[1] = _mm_srai_epi32(v[1], 2);
- u[2] = _mm_srai_epi32(v[2], 2);
- u[3] = _mm_srai_epi32(v[3], 2);
- u[4] = _mm_srai_epi32(v[4], 2);
- u[5] = _mm_srai_epi32(v[5], 2);
- u[6] = _mm_srai_epi32(v[6], 2);
- u[7] = _mm_srai_epi32(v[7], 2);
- u[8] = _mm_srai_epi32(v[8], 2);
- u[9] = _mm_srai_epi32(v[9], 2);
- u[10] = _mm_srai_epi32(v[10], 2);
- u[11] = _mm_srai_epi32(v[11], 2);
- u[12] = _mm_srai_epi32(v[12], 2);
- u[13] = _mm_srai_epi32(v[13], 2);
- u[14] = _mm_srai_epi32(v[14], 2);
- u[15] = _mm_srai_epi32(v[15], 2);
-
- out[5] = _mm_packs_epi32(u[0], u[1]);
- out[21] = _mm_packs_epi32(u[2], u[3]);
- out[13] = _mm_packs_epi32(u[4], u[5]);
- out[29] = _mm_packs_epi32(u[6], u[7]);
- out[3] = _mm_packs_epi32(u[8], u[9]);
- out[19] = _mm_packs_epi32(u[10], u[11]);
- out[11] = _mm_packs_epi32(u[12], u[13]);
- out[27] = _mm_packs_epi32(u[14], u[15]);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&out[5], &out[21], &out[13], &out[29],
- &out[3], &out[19], &out[11], &out[27]);
- if (overflow) {
- HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
-#endif // FDCT32x32_HIGH_PRECISION
- // Transpose the results, do it as four 8x8 transposes.
- {
- int transpose_block;
- int16_t *output0 = &intermediate[column_start * 32];
- tran_low_t *output1 = &output_org[column_start * 32];
- for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
- __m128i *this_out = &out[8 * transpose_block];
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- if (0 == pass) {
- // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
- // TODO(cd): see quality impact of only doing
- // output[j] = (output[j] + 1) >> 2;
- // which would remove the code between here ...
- __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
- __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
- __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
- __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
- __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
- __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
- __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
- __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
- tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
- tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
- tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
- tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
- tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
- tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
- tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
- tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
- // ... and here.
- // PS: also change code in av1/encoder/dct.c
- tr2_0 = _mm_add_epi16(tr2_0, kOne);
- tr2_1 = _mm_add_epi16(tr2_1, kOne);
- tr2_2 = _mm_add_epi16(tr2_2, kOne);
- tr2_3 = _mm_add_epi16(tr2_3, kOne);
- tr2_4 = _mm_add_epi16(tr2_4, kOne);
- tr2_5 = _mm_add_epi16(tr2_5, kOne);
- tr2_6 = _mm_add_epi16(tr2_6, kOne);
- tr2_7 = _mm_add_epi16(tr2_7, kOne);
- tr2_0 = _mm_srai_epi16(tr2_0, 2);
- tr2_1 = _mm_srai_epi16(tr2_1, 2);
- tr2_2 = _mm_srai_epi16(tr2_2, 2);
- tr2_3 = _mm_srai_epi16(tr2_3, 2);
- tr2_4 = _mm_srai_epi16(tr2_4, 2);
- tr2_5 = _mm_srai_epi16(tr2_5, 2);
- tr2_6 = _mm_srai_epi16(tr2_6, 2);
- tr2_7 = _mm_srai_epi16(tr2_7, 2);
- }
- // Note: even though all these stores are aligned, using the aligned
- // intrinsic makes the code slightly slower.
- if (pass == 0) {
- _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
- _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
- _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
- _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
- _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
- _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
- _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
- _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
- // Process next 8x8
- output0 += 8;
- } else {
- storeu_output(&tr2_0, (output1 + 0 * 32));
- storeu_output(&tr2_1, (output1 + 1 * 32));
- storeu_output(&tr2_2, (output1 + 2 * 32));
- storeu_output(&tr2_3, (output1 + 3 * 32));
- storeu_output(&tr2_4, (output1 + 4 * 32));
- storeu_output(&tr2_5, (output1 + 5 * 32));
- storeu_output(&tr2_6, (output1 + 6 * 32));
- storeu_output(&tr2_7, (output1 + 7 * 32));
- // Process next 8x8
- output1 += 8;
- }
- }
- }
- }
- }
-} // NOLINT
-
-#undef ADD_EPI16
-#undef SUB_EPI16
-#undef HIGH_FDCT32x32_2D_C
-#undef HIGH_FDCT32x32_2D_ROWS_C
diff --git a/av1/common/x86/av1_fwd_txfm1d_sse4.c b/av1/common/x86/av1_fwd_txfm1d_sse4.c
new file mode 100644
index 0000000..c09a019
--- /dev/null
+++ b/av1/common/x86/av1_fwd_txfm1d_sse4.c
@@ -0,0 +1,976 @@
+#include "av1/common/x86/av1_txfm1d_sse4.h"
+
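+ // Forward 32-point 1-D DCT, SSE4.1. Each __m128i carries four 32-bit
+ // coefficients, so the 32 columns are processed as 32/4 = 8 column groups.
+ // The btf_32_sse4_1_type0/type1 helpers from av1_txfm1d_sse4.h apply a
+ // butterfly with the given cosine-pair weights, rounding by the per-stage
+ // cos_bit value.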
+void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 32;
+ const int num_per_128 = 4;
+ const int32_t *cospi;
+ __m128i buf0[32];
+ __m128i buf1[32];
+ int col_num = txfm_size / num_per_128;
+ int bit;
+ int col;
+ (void)stage_range;
+ for (col = 0; col < col_num; col++) {
+ // stage 0
+ int32_t stage_idx = 0;
+ buf0[0] = input[0 * col_num + col];
+ buf0[1] = input[1 * col_num + col];
+ buf0[2] = input[2 * col_num + col];
+ buf0[3] = input[3 * col_num + col];
+ buf0[4] = input[4 * col_num + col];
+ buf0[5] = input[5 * col_num + col];
+ buf0[6] = input[6 * col_num + col];
+ buf0[7] = input[7 * col_num + col];
+ buf0[8] = input[8 * col_num + col];
+ buf0[9] = input[9 * col_num + col];
+ buf0[10] = input[10 * col_num + col];
+ buf0[11] = input[11 * col_num + col];
+ buf0[12] = input[12 * col_num + col];
+ buf0[13] = input[13 * col_num + col];
+ buf0[14] = input[14 * col_num + col];
+ buf0[15] = input[15 * col_num + col];
+ buf0[16] = input[16 * col_num + col];
+ buf0[17] = input[17 * col_num + col];
+ buf0[18] = input[18 * col_num + col];
+ buf0[19] = input[19 * col_num + col];
+ buf0[20] = input[20 * col_num + col];
+ buf0[21] = input[21 * col_num + col];
+ buf0[22] = input[22 * col_num + col];
+ buf0[23] = input[23 * col_num + col];
+ buf0[24] = input[24 * col_num + col];
+ buf0[25] = input[25 * col_num + col];
+ buf0[26] = input[26 * col_num + col];
+ buf0[27] = input[27 * col_num + col];
+ buf0[28] = input[28 * col_num + col];
+ buf0[29] = input[29 * col_num + col];
+ buf0[30] = input[30 * col_num + col];
+ buf0[31] = input[31 * col_num + col];
+
+ // stage 1
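+ // (butterfly: input i is paired with input 31 - i; sums land in the low
+ // half of the buffer, differences in the high half)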
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[31]);
+ buf1[31] = _mm_sub_epi32(buf0[0], buf0[31]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[30]);
+ buf1[30] = _mm_sub_epi32(buf0[1], buf0[30]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[2], buf0[29]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[28]);
+ buf1[28] = _mm_sub_epi32(buf0[3], buf0[28]);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[4], buf0[27]);
+ buf1[5] = _mm_add_epi32(buf0[5], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[5], buf0[26]);
+ buf1[6] = _mm_add_epi32(buf0[6], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[6], buf0[25]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[24]);
+ buf1[24] = _mm_sub_epi32(buf0[7], buf0[24]);
+ buf1[8] = _mm_add_epi32(buf0[8], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[8], buf0[23]);
+ buf1[9] = _mm_add_epi32(buf0[9], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[9], buf0[22]);
+ buf1[10] = _mm_add_epi32(buf0[10], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[10], buf0[21]);
+ buf1[11] = _mm_add_epi32(buf0[11], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[11], buf0[20]);
+ buf1[12] = _mm_add_epi32(buf0[12], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[12], buf0[19]);
+ buf1[13] = _mm_add_epi32(buf0[13], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[13], buf0[18]);
+ buf1[14] = _mm_add_epi32(buf0[14], buf0[17]);
+ buf1[17] = _mm_sub_epi32(buf0[14], buf0[17]);
+ buf1[15] = _mm_add_epi32(buf0[15], buf0[16]);
+ buf1[16] = _mm_sub_epi32(buf0[15], buf0[16]);
+
+ // stage 2
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[15]);
+ buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[14]);
+ buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]);
+ buf0[2] = _mm_add_epi32(buf1[2], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]);
+ buf0[3] = _mm_add_epi32(buf1[3], buf1[12]);
+ buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]);
+ buf0[4] = _mm_add_epi32(buf1[4], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]);
+ buf0[5] = _mm_add_epi32(buf1[5], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]);
+ buf0[6] = _mm_add_epi32(buf1[6], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]);
+ buf0[7] = _mm_add_epi32(buf1[7], buf1[8]);
+ buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+ buf0[26], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+ buf0[25], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+ buf0[24], bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
+ buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
+ buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
+ buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+ buf1[13], bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+ buf1[12], bit);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]);
+ buf1[18] = _mm_add_epi32(buf0[18], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]);
+ buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[24]);
+ buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]);
+ buf1[29] = _mm_add_epi32(buf0[29], buf0[26]);
+ buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[27]);
+
+ // stage 4
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
+ buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
+ buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
+ buf0[4] = buf1[4];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
+ buf0[6], bit);
+ buf0[7] = buf1[7];
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]);
+ buf0[9] = _mm_add_epi32(buf1[9], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]);
+ buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[12]);
+ buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]);
+ buf0[14] = _mm_add_epi32(buf1[14], buf1[13]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+ buf0[28], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+ buf0[26], bit);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
+ buf1[1], bit);
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
+ buf1[3], bit);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
+ buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
+ buf1[8] = buf0[8];
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
+ buf1[14], bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+ buf1[13], bit);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]);
+ buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[20]);
+ buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]);
+ buf1[22] = _mm_add_epi32(buf0[22], buf0[21]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]);
+ buf1[25] = _mm_add_epi32(buf0[25], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]);
+ buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[28]);
+ buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[29]);
+
+ // stage 6
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+ bit);
+ btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
+ buf0[6], bit);
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]);
+ buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]);
+ buf0[11] = _mm_add_epi32(buf1[11], buf1[10]);
+ buf0[12] = _mm_add_epi32(buf1[12], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]);
+ buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[14]);
+ buf0[16] = buf1[16];
+ btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+ buf0[30], bit);
+ btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+ buf0[29], bit);
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+ buf0[26], bit);
+ btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+ buf0[25], bit);
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8],
+ buf1[15], bit);
+ btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
+ buf1[14], bit);
+ btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+ buf1[13], bit);
+ btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+ buf1[12], bit);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[17]);
+ buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]);
+ buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[18]);
+ buf1[20] = _mm_add_epi32(buf0[20], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]);
+ buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[22]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]);
+ buf1[27] = _mm_add_epi32(buf0[27], buf0[26]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]);
+ buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[30]);
+
+ // stage 8
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
+ buf0[31], bit);
+ btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+ buf0[30], bit);
+ btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+ buf0[29], bit);
+ btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+ buf0[28], bit);
+ btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+ buf0[27], bit);
+ btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+ buf0[26], bit);
+ btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+ buf0[25], bit);
+ btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
+ buf0[24], bit);
+
+ // stage 9
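+ // (output permutation only: coefficients are reordered into the natural
+ // DCT output order, no arithmetic)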
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[16];
+ buf1[2] = buf0[8];
+ buf1[3] = buf0[24];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[20];
+ buf1[6] = buf0[12];
+ buf1[7] = buf0[28];
+ buf1[8] = buf0[2];
+ buf1[9] = buf0[18];
+ buf1[10] = buf0[10];
+ buf1[11] = buf0[26];
+ buf1[12] = buf0[6];
+ buf1[13] = buf0[22];
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[30];
+ buf1[16] = buf0[1];
+ buf1[17] = buf0[17];
+ buf1[18] = buf0[9];
+ buf1[19] = buf0[25];
+ buf1[20] = buf0[5];
+ buf1[21] = buf0[21];
+ buf1[22] = buf0[13];
+ buf1[23] = buf0[29];
+ buf1[24] = buf0[3];
+ buf1[25] = buf0[19];
+ buf1[26] = buf0[11];
+ buf1[27] = buf0[27];
+ buf1[28] = buf0[7];
+ buf1[29] = buf0[23];
+ buf1[30] = buf0[15];
+ buf1[31] = buf0[31];
+
+ output[0 * col_num + col] = buf1[0];
+ output[1 * col_num + col] = buf1[1];
+ output[2 * col_num + col] = buf1[2];
+ output[3 * col_num + col] = buf1[3];
+ output[4 * col_num + col] = buf1[4];
+ output[5 * col_num + col] = buf1[5];
+ output[6 * col_num + col] = buf1[6];
+ output[7 * col_num + col] = buf1[7];
+ output[8 * col_num + col] = buf1[8];
+ output[9 * col_num + col] = buf1[9];
+ output[10 * col_num + col] = buf1[10];
+ output[11 * col_num + col] = buf1[11];
+ output[12 * col_num + col] = buf1[12];
+ output[13 * col_num + col] = buf1[13];
+ output[14 * col_num + col] = buf1[14];
+ output[15 * col_num + col] = buf1[15];
+ output[16 * col_num + col] = buf1[16];
+ output[17 * col_num + col] = buf1[17];
+ output[18 * col_num + col] = buf1[18];
+ output[19 * col_num + col] = buf1[19];
+ output[20 * col_num + col] = buf1[20];
+ output[21 * col_num + col] = buf1[21];
+ output[22 * col_num + col] = buf1[22];
+ output[23 * col_num + col] = buf1[23];
+ output[24 * col_num + col] = buf1[24];
+ output[25 * col_num + col] = buf1[25];
+ output[26 * col_num + col] = buf1[26];
+ output[27 * col_num + col] = buf1[27];
+ output[28 * col_num + col] = buf1[28];
+ output[29 * col_num + col] = buf1[29];
+ output[30 * col_num + col] = buf1[30];
+ output[31 * col_num + col] = buf1[31];
+ }
+}
+
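+ // Forward 4-point 1-D ADST, SSE4.1. With four 32-bit lanes per __m128i
+ // this is a single column group: permute the inputs, rotate with the
+ // (cospi[8], cospi[56]) and (cospi[40], cospi[24]) pairs, butterfly,
+ // rotate with (cospi[32], cospi[32]), then negate/reorder into the output.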
+void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 4;
+ const int num_per_128 = 4;
+ const int32_t *cospi;
+ __m128i buf0[4];
+ __m128i buf1[4];
+ int col_num = txfm_size / num_per_128;
+ int bit;
+ int col;
+ (void)stage_range;
+ for (col = 0; col < col_num; col++) {
+ // stage 0
+ int32_t stage_idx = 0;
+ buf0[0] = input[0 * col_num + col];
+ buf0[1] = input[1 * col_num + col];
+ buf0[2] = input[2 * col_num + col];
+ buf0[3] = input[3 * col_num + col];
+
+ // stage 1
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf1[0] = buf0[3];
+ buf1[1] = buf0[0];
+ buf1[2] = buf0[1];
+ buf1[3] = buf0[2];
+
+ // stage 2
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
+ bit);
+ btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
+
+ // stage 3
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+ buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+ buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+
+ // stage 4
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
+
+ // stage 5
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf1[0] = buf0[0];
+ buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
+ buf1[2] = buf0[3];
+ buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
+
+ output[0 * col_num + col] = buf1[0];
+ output[1 * col_num + col] = buf1[1];
+ output[2 * col_num + col] = buf1[2];
+ output[3 * col_num + col] = buf1[3];
+ }
+}
+
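+ // Forward 32-point 1-D ADST, SSE4.1. Same column-wise layout as the
+ // fdct32 above (eight groups of four 32-bit lanes); the stages alternate
+ // cosine-pair rotations with add/sub butterflies.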
+void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range) {
+ const int txfm_size = 32;
+ const int num_per_128 = 4;
+ const int32_t *cospi;
+ __m128i buf0[32];
+ __m128i buf1[32];
+ int col_num = txfm_size / num_per_128;
+ int bit;
+ int col;
+ (void)stage_range;
+ for (col = 0; col < col_num; col++) {
+ // stage 0
+ int32_t stage_idx = 0;
+ buf0[0] = input[0 * col_num + col];
+ buf0[1] = input[1 * col_num + col];
+ buf0[2] = input[2 * col_num + col];
+ buf0[3] = input[3 * col_num + col];
+ buf0[4] = input[4 * col_num + col];
+ buf0[5] = input[5 * col_num + col];
+ buf0[6] = input[6 * col_num + col];
+ buf0[7] = input[7 * col_num + col];
+ buf0[8] = input[8 * col_num + col];
+ buf0[9] = input[9 * col_num + col];
+ buf0[10] = input[10 * col_num + col];
+ buf0[11] = input[11 * col_num + col];
+ buf0[12] = input[12 * col_num + col];
+ buf0[13] = input[13 * col_num + col];
+ buf0[14] = input[14 * col_num + col];
+ buf0[15] = input[15 * col_num + col];
+ buf0[16] = input[16 * col_num + col];
+ buf0[17] = input[17 * col_num + col];
+ buf0[18] = input[18 * col_num + col];
+ buf0[19] = input[19 * col_num + col];
+ buf0[20] = input[20 * col_num + col];
+ buf0[21] = input[21 * col_num + col];
+ buf0[22] = input[22 * col_num + col];
+ buf0[23] = input[23 * col_num + col];
+ buf0[24] = input[24 * col_num + col];
+ buf0[25] = input[25 * col_num + col];
+ buf0[26] = input[26 * col_num + col];
+ buf0[27] = input[27 * col_num + col];
+ buf0[28] = input[28 * col_num + col];
+ buf0[29] = input[29 * col_num + col];
+ buf0[30] = input[30 * col_num + col];
+ buf0[31] = input[31 * col_num + col];
+
+ // stage 1
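+ // (input permutation only; no arithmetic in this stage)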
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf1[0] = buf0[31];
+ buf1[1] = buf0[0];
+ buf1[2] = buf0[29];
+ buf1[3] = buf0[2];
+ buf1[4] = buf0[27];
+ buf1[5] = buf0[4];
+ buf1[6] = buf0[25];
+ buf1[7] = buf0[6];
+ buf1[8] = buf0[23];
+ buf1[9] = buf0[8];
+ buf1[10] = buf0[21];
+ buf1[11] = buf0[10];
+ buf1[12] = buf0[19];
+ buf1[13] = buf0[12];
+ buf1[14] = buf0[17];
+ buf1[15] = buf0[14];
+ buf1[16] = buf0[15];
+ buf1[17] = buf0[16];
+ buf1[18] = buf0[13];
+ buf1[19] = buf0[18];
+ buf1[20] = buf0[11];
+ buf1[21] = buf0[20];
+ buf1[22] = buf0[9];
+ buf1[23] = buf0[22];
+ buf1[24] = buf0[7];
+ buf1[25] = buf0[24];
+ buf1[26] = buf0[5];
+ buf1[27] = buf0[26];
+ buf1[28] = buf0[3];
+ buf1[29] = buf0[28];
+ buf1[30] = buf0[1];
+ buf1[31] = buf0[30];
+
+ // stage 2
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1],
+ bit);
+ btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3],
+ bit);
+ btf_32_sse4_1_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5],
+ bit);
+ btf_32_sse4_1_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
+ btf_32_sse4_1_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8],
+ buf0[9], bit);
+ btf_32_sse4_1_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10],
+ buf0[11], bit);
+ btf_32_sse4_1_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12],
+ buf0[13], bit);
+ btf_32_sse4_1_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
+ btf_32_sse4_1_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16],
+ buf0[17], bit);
+ btf_32_sse4_1_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18],
+ buf0[19], bit);
+ btf_32_sse4_1_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20],
+ buf0[21], bit);
+ btf_32_sse4_1_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22],
+ buf0[23], bit);
+ btf_32_sse4_1_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24],
+ buf0[25], bit);
+ btf_32_sse4_1_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
+
+ // stage 3
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[16]);
+ buf1[16] = _mm_sub_epi32(buf0[0], buf0[16]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[17]);
+ buf1[17] = _mm_sub_epi32(buf0[1], buf0[17]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[2], buf0[18]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[3], buf0[19]);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[4], buf0[20]);
+ buf1[5] = _mm_add_epi32(buf0[5], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[5], buf0[21]);
+ buf1[6] = _mm_add_epi32(buf0[6], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[6], buf0[22]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[7], buf0[23]);
+ buf1[8] = _mm_add_epi32(buf0[8], buf0[24]);
+ buf1[24] = _mm_sub_epi32(buf0[8], buf0[24]);
+ buf1[9] = _mm_add_epi32(buf0[9], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[9], buf0[25]);
+ buf1[10] = _mm_add_epi32(buf0[10], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[10], buf0[26]);
+ buf1[11] = _mm_add_epi32(buf0[11], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[11], buf0[27]);
+ buf1[12] = _mm_add_epi32(buf0[12], buf0[28]);
+ buf1[28] = _mm_sub_epi32(buf0[12], buf0[28]);
+ buf1[13] = _mm_add_epi32(buf0[13], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[13], buf0[29]);
+ buf1[14] = _mm_add_epi32(buf0[14], buf0[30]);
+ buf1[30] = _mm_sub_epi32(buf0[14], buf0[30]);
+ buf1[15] = _mm_add_epi32(buf0[15], buf0[31]);
+ buf1[31] = _mm_sub_epi32(buf0[15], buf0[31]);
+
+ // stage 4
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16],
+ buf0[17], bit);
+ btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18],
+ buf0[19], bit);
+ btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20],
+ buf0[21], bit);
+ btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22],
+ buf0[23], bit);
+ btf_32_sse4_1_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24],
+ buf0[25], bit);
+ btf_32_sse4_1_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
+
+ // stage 5
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[8]);
+ buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[9]);
+ buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[10]);
+ buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[11]);
+ buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[12]);
+ buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]);
+ buf1[5] = _mm_add_epi32(buf0[5], buf0[13]);
+ buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]);
+ buf1[6] = _mm_add_epi32(buf0[6], buf0[14]);
+ buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[15]);
+ buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[24]);
+ buf1[24] = _mm_sub_epi32(buf0[16], buf0[24]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[17], buf0[25]);
+ buf1[18] = _mm_add_epi32(buf0[18], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[18], buf0[26]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[19], buf0[27]);
+ buf1[20] = _mm_add_epi32(buf0[20], buf0[28]);
+ buf1[28] = _mm_sub_epi32(buf0[20], buf0[28]);
+ buf1[21] = _mm_add_epi32(buf0[21], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[21], buf0[29]);
+ buf1[22] = _mm_add_epi32(buf0[22], buf0[30]);
+ buf1[30] = _mm_sub_epi32(buf0[22], buf0[30]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[31]);
+ buf1[31] = _mm_sub_epi32(buf0[23], buf0[31]);
+
+ // stage 6
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
+ bit);
+ btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
+ buf0[11], bit);
+ btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
+ buf0[13], bit);
+ btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ buf0[21] = buf1[21];
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24],
+ buf0[25], bit);
+ btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26],
+ buf0[27], bit);
+ btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
+
+ // stage 7
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
+ buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
+ buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
+ buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
+ buf1[8] = _mm_add_epi32(buf0[8], buf0[12]);
+ buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]);
+ buf1[9] = _mm_add_epi32(buf0[9], buf0[13]);
+ buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]);
+ buf1[10] = _mm_add_epi32(buf0[10], buf0[14]);
+ buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]);
+ buf1[11] = _mm_add_epi32(buf0[11], buf0[15]);
+ buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[16], buf0[20]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[17], buf0[21]);
+ buf1[18] = _mm_add_epi32(buf0[18], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[18], buf0[22]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[19], buf0[23]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[28]);
+ buf1[28] = _mm_sub_epi32(buf0[24], buf0[28]);
+ buf1[25] = _mm_add_epi32(buf0[25], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[25], buf0[29]);
+ buf1[26] = _mm_add_epi32(buf0[26], buf0[30]);
+ buf1[30] = _mm_sub_epi32(buf0[26], buf0[30]);
+ buf1[27] = _mm_add_epi32(buf0[27], buf0[31]);
+ buf1[31] = _mm_sub_epi32(buf0[27], buf0[31]);
+
+ // stage 8
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
+ buf0[5], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
+ buf0[13], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20],
+ buf0[21], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22],
+ buf0[23], bit);
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[26] = buf1[26];
+ buf0[27] = buf1[27];
+ btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28],
+ buf0[29], bit);
+ btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
+
+ // stage 9
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+ buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+ buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
+ buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
+ buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
+ buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
+ buf1[8] = _mm_add_epi32(buf0[8], buf0[10]);
+ buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]);
+ buf1[9] = _mm_add_epi32(buf0[9], buf0[11]);
+ buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]);
+ buf1[12] = _mm_add_epi32(buf0[12], buf0[14]);
+ buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]);
+ buf1[13] = _mm_add_epi32(buf0[13], buf0[15]);
+ buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[16], buf0[18]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[17], buf0[19]);
+ buf1[20] = _mm_add_epi32(buf0[20], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[20], buf0[22]);
+ buf1[21] = _mm_add_epi32(buf0[21], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[21], buf0[23]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[24], buf0[26]);
+ buf1[25] = _mm_add_epi32(buf0[25], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[25], buf0[27]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[30]);
+ buf1[30] = _mm_sub_epi32(buf0[28], buf0[30]);
+ buf1[29] = _mm_add_epi32(buf0[29], buf0[31]);
+ buf1[31] = _mm_sub_epi32(buf0[29], buf0[31]);
+
+ // stage 10
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+ buf0[3], bit);
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
+ buf0[7], bit);
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
+ buf0[11], bit);
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
+ buf0[15], bit);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18],
+ buf0[19], bit);
+ buf0[20] = buf1[20];
+ buf0[21] = buf1[21];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22],
+ buf0[23], bit);
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26],
+ buf0[27], bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30],
+ buf0[31], bit);
+
+ // stage 11
+ stage_idx++;
+ bit = cos_bit[stage_idx];
+ cospi = cospi_arr[bit - cos_bit_min];
+ buf1[0] = buf0[0];
+ buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[16]);
+ buf1[2] = buf0[24];
+ buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]);
+ buf1[4] = buf0[12];
+ buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[28]);
+ buf1[6] = buf0[20];
+ buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
+ buf1[8] = buf0[6];
+ buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[22]);
+ buf1[10] = buf0[30];
+ buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]);
+ buf1[12] = buf0[10];
+ buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[26]);
+ buf1[14] = buf0[18];
+ buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
+ buf1[16] = buf0[3];
+ buf1[17] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[19]);
+ buf1[18] = buf0[27];
+ buf1[19] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]);
+ buf1[20] = buf0[15];
+ buf1[21] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[31]);
+ buf1[22] = buf0[23];
+ buf1[23] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
+ buf1[24] = buf0[5];
+ buf1[25] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[21]);
+ buf1[26] = buf0[29];
+ buf1[27] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]);
+ buf1[28] = buf0[9];
+ buf1[29] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[25]);
+ buf1[30] = buf0[17];
+ buf1[31] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
+
+ output[0 * col_num + col] = buf1[0];
+ output[1 * col_num + col] = buf1[1];
+ output[2 * col_num + col] = buf1[2];
+ output[3 * col_num + col] = buf1[3];
+ output[4 * col_num + col] = buf1[4];
+ output[5 * col_num + col] = buf1[5];
+ output[6 * col_num + col] = buf1[6];
+ output[7 * col_num + col] = buf1[7];
+ output[8 * col_num + col] = buf1[8];
+ output[9 * col_num + col] = buf1[9];
+ output[10 * col_num + col] = buf1[10];
+ output[11 * col_num + col] = buf1[11];
+ output[12 * col_num + col] = buf1[12];
+ output[13 * col_num + col] = buf1[13];
+ output[14 * col_num + col] = buf1[14];
+ output[15 * col_num + col] = buf1[15];
+ output[16 * col_num + col] = buf1[16];
+ output[17 * col_num + col] = buf1[17];
+ output[18 * col_num + col] = buf1[18];
+ output[19 * col_num + col] = buf1[19];
+ output[20 * col_num + col] = buf1[20];
+ output[21 * col_num + col] = buf1[21];
+ output[22 * col_num + col] = buf1[22];
+ output[23 * col_num + col] = buf1[23];
+ output[24 * col_num + col] = buf1[24];
+ output[25 * col_num + col] = buf1[25];
+ output[26 * col_num + col] = buf1[26];
+ output[27 * col_num + col] = buf1[27];
+ output[28 * col_num + col] = buf1[28];
+ output[29 * col_num + col] = buf1[29];
+ output[30 * col_num + col] = buf1[30];
+ output[31 * col_num + col] = buf1[31];
+ }
+}
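Each stage of the 1D kernel above either adds/subtracts pairs of 32-bit lanes or rotates them through the btf_32_sse4_1_type0 butterfly with cosine weights taken from cospi_arr[bit - cos_bit_min]. The helper itself is defined elsewhere (presumably av1/common/x86/av1_txfm1d_sse4.h, which the new file below includes), so the scalar sketch that follows is an assumption about its per-lane arithmetic rather than a copy of it.

#include <stdint.h>

/* Assumed per-lane behaviour of btf_32_sse4_1_type0(w0, w1, in0, in1,
 * out0, out1, bit): a planar rotation by the cosine pair (w0, w1), with
 * each product rounded back down by `bit` fractional bits. The SSE4.1
 * macro is believed to do the same thing on four lanes at once. */
static void btf_type0_scalar(int32_t w0, int32_t w1, int32_t in0, int32_t in1,
                             int32_t *out0, int32_t *out1, int8_t bit) {
  const int64_t rounding = 1LL << (bit - 1);
  *out0 = (int32_t)(((int64_t)w0 * in0 + (int64_t)w1 * in1 + rounding) >> bit);
  *out1 = (int32_t)(((int64_t)w1 * in0 - (int64_t)w0 * in1 + rounding) >> bit);
}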
diff --git a/av1/common/x86/av1_fwd_txfm2d_sse4.c b/av1/common/x86/av1_fwd_txfm2d_sse4.c
new file mode 100644
index 0000000..3d60b36
--- /dev/null
+++ b/av1/common/x86/av1_fwd_txfm2d_sse4.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./av1_rtcd.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm1d_sse4.h"
+
+static INLINE void int16_array_with_stride_to_int32_array_without_stride(
+ const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
+ int r, c;
+ for (r = 0; r < txfm1d_size; r++) {
+ for (c = 0; c < txfm1d_size; c++) {
+ output[r * txfm1d_size + c] = (int32_t)input[r * stride + c];
+ }
+ }
+}
+
+typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+ switch (txfm_type) {
+ case TXFM_TYPE_DCT32: return av1_fdct32_new_sse4_1; break;
+ case TXFM_TYPE_ADST32: return av1_fadst32_new_sse4_1; break;
+ default: assert(0);
+ }
+ return NULL;
+}
+
+static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
+ const int stride, const TXFM_2D_CFG *cfg,
+ int32_t *txfm_buf) {
+ const int txfm_size = cfg->txfm_size;
+ const int8_t *shift = cfg->shift;
+ const int8_t *stage_range_col = cfg->stage_range_col;
+ const int8_t *stage_range_row = cfg->stage_range_row;
+ const int8_t *cos_bit_col = cfg->cos_bit_col;
+ const int8_t *cos_bit_row = cfg->cos_bit_row;
+ const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+ __m128i *buf_128 = (__m128i *)txfm_buf;
+ __m128i *out_128 = (__m128i *)output;
+ int num_per_128 = 4;
+ int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+
+ int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
+ txfm_size);
+ round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
+ txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+ round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+ transpose_32(txfm_size, out_128, buf_128);
+ txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
+ round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
+ transpose_32(txfm_size, buf_128, out_128);
+}
+
+void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
+ int stride, int tx_type, int bd) {
+ int32_t txfm_buf[1024];
+ TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_cfg(tx_type, TX_32X32);
+ (void)bd;
+ fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf);
+}
+
+void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, int tx_type, int bd) {
+ int32_t txfm_buf[4096];
+ TXFM_2D_FLIP_CFG cfg = av1_get_fwd_txfm_64x64_cfg(tx_type);
+ (void)bd;
+ fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf);
+}
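fwd_txfm2d_sse4_1() above expresses the 2D transform as: widen the strided int16 input to a dense int32 block, pre-scale, run the 1D column transform, rescale, transpose, run the 1D transform again (now over what were the rows), rescale, and transpose back. The scalar sketch below restates that flow. round_shift_block(), transpose_block(), Txfm1dFunc and the interpretation of the shift signs are illustrative assumptions introduced here; the real helpers are round_shift_array_32_sse4_1() and transpose_32(), whose definitions are not part of this hunk.

#include <stdint.h>

typedef void (*Txfm1dFunc)(const int32_t *in, int32_t *out, int n);

/* Hypothetical scalar stand-in for round_shift_array_32_sse4_1():
 * bit > 0 right-shifts with rounding, bit < 0 left-shifts. */
static void round_shift_block(const int32_t *in, int32_t *out, int size,
                              int bit) {
  for (int i = 0; i < size; ++i) {
    if (bit >= 0) {
      const int64_t rnd = bit ? (1LL << (bit - 1)) : 0;
      out[i] = (int32_t)(((int64_t)in[i] + rnd) >> bit);
    } else {
      out[i] = in[i] * (1 << -bit);
    }
  }
}

static void transpose_block(const int32_t *in, int32_t *out, int n) {
  for (int r = 0; r < n; ++r)
    for (int c = 0; c < n; ++c) out[c * n + r] = in[r * n + c];
}

/* Scalar sketch of the fwd_txfm2d_sse4_1() pipeline. col_txfm/row_txfm
 * stand in for the function pointers picked by fwd_txfm_type_to_func(). */
static void fwd_txfm2d_sketch(const int16_t *input, int stride,
                              int32_t *output, int32_t *buf, int n,
                              const int8_t *shift, Txfm1dFunc col_txfm,
                              Txfm1dFunc row_txfm) {
  for (int r = 0; r < n; ++r)                       /* widen, drop stride   */
    for (int c = 0; c < n; ++c) buf[r * n + c] = input[r * stride + c];
  round_shift_block(buf, output, n * n, -shift[0]); /* pre-scale            */
  col_txfm(output, buf, n);                         /* columns              */
  round_shift_block(buf, output, n * n, -shift[1]);
  transpose_block(output, buf, n);
  row_txfm(buf, output, n);                         /* rows, run as columns */
  round_shift_block(output, buf, n * n, -shift[2]);
  transpose_block(buf, output, n);                  /* back to row order    */
}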
diff --git a/av1/common/x86/av1_fwd_txfm_impl_sse2.h b/av1/common/x86/av1_fwd_txfm_impl_sse2.h
deleted file mode 100644
index 0e341ac..0000000
--- a/av1/common/x86/av1_fwd_txfm_impl_sse2.h
+++ /dev/null
@@ -1,1014 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/fwd_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-#include "aom_ports/mem.h"
-
-// TODO(jingning) The high bit-depth functions need rework for performance.
-// After we properly fix the high bit-depth function implementations, this
-// file's dependency should be substantially simplified.
-#if DCT_HIGH_BIT_DEPTH
-#define ADD_EPI16 _mm_adds_epi16
-#define SUB_EPI16 _mm_subs_epi16
-
-#else
-#define ADD_EPI16 _mm_add_epi16
-#define SUB_EPI16 _mm_sub_epi16
-#endif
-
-void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
- // This 2D transform implements 4 vertical 1D transforms followed
- // by 4 horizontal 1D transforms. The multiplies and adds are as given
- // by Chen, Smith and Fralick ('77). The commands for moving the data
- // around have been minimized by hand.
-  // For the purposes of the comments, the 16 inputs are referred to as i0
- // through iF (in raster order), intermediate variables are a0, b0, c0
- // through f, and correspond to the in-place computations mapped to input
- // locations. The outputs, o0 through oF are labeled according to the
- // output locations.
-
- // Constants
- // These are the coefficients used for the multiplies.
- // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
- // where cospi_N_64 = cos(N pi /64)
- const __m128i k__cospi_A =
- octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
- cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_B =
- octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
- cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
- const __m128i k__cospi_C =
- octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
- cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_D =
- octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
- cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64);
- const __m128i k__cospi_E =
- octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
- const __m128i k__cospi_F =
- octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_G =
- octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
- -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64);
- const __m128i k__cospi_H =
- octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
- -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64);
-
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- // This second rounding constant saves doing some extra adds at the end
- const __m128i k__DCT_CONST_ROUNDING2 =
- _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
- const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
- const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
- const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
- __m128i in0, in1;
-#if DCT_HIGH_BIT_DEPTH
- __m128i cmp0, cmp1;
- int test, overflow;
-#endif
-
- // Load inputs.
- in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in1 = _mm_unpacklo_epi64(
- in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
- in0 = _mm_unpacklo_epi64(
- in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
-// in0 = [i0 i1 i2 i3 iC iD iE iF]
-// in1 = [i4 i5 i6 i7 i8 i9 iA iB]
-#if DCT_HIGH_BIT_DEPTH
- // Check inputs small enough to use optimised code
- cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
- _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00)));
- cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
- _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00)));
- test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
- if (test) {
- aom_highbd_fdct4x4_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
-
- // multiply by 16 to give some extra precision
- in0 = _mm_slli_epi16(in0, 4);
- in1 = _mm_slli_epi16(in1, 4);
- // if (i == 0 && input[0]) input[0] += 1;
- // add 1 to the upper left pixel if it is non-zero, which helps reduce
- // the round-trip error
- {
- // The mask will only contain whether the first value is zero, all
-    // other comparisons will fail as something shifted by 4 (above << 4)
- // can never be equal to one. To increment in the non-zero case, we
- // add the mask and one for the first element:
- // - if zero, mask = -1, v = v - 1 + 1 = v
- // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
- __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
- in0 = _mm_add_epi16(in0, mask);
- in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
- }
-  // There are 4 total stages, alternating between an add/subtract stage
-  // and a multiply-and-add stage.
- {
- // Stage 1: Add/subtract
-
- // in0 = [i0 i1 i2 i3 iC iD iE iF]
- // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
- const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
- const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
- // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
- // r1 = [iC i8 iD i9 iE iA iF iB]
- const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
- const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
- // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
- // r3 = [iC i8 iD i9 iF iB iE iA]
-
- const __m128i t0 = _mm_add_epi16(r2, r3);
- const __m128i t1 = _mm_sub_epi16(r2, r3);
- // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
- // t1 = [aC a8 aD a9 aF aB aE aA]
-
- // Stage 2: multiply by constants (which gets us into 32 bits).
- // The constants needed here are:
- // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
- // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
- // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
- // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
- // Then add and right-shift to get back to 16-bit range
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // w0 = [b0 b1 b7 b6]
- // w1 = [b8 b9 bF bE]
- // w2 = [b4 b5 b3 b2]
- // w3 = [bC bD bB bA]
- const __m128i x0 = _mm_packs_epi32(w0, w1);
- const __m128i x1 = _mm_packs_epi32(w2, w3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&x0, &x1);
- if (overflow) {
- aom_highbd_fdct4x4_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
- // x1 = [b4 b5 b3 b2 bC bD bB bA]
- in0 = _mm_shuffle_epi32(x0, 0xD8);
- in1 = _mm_shuffle_epi32(x1, 0x8D);
- // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
- // in1 = [b3 b2 bB bA b4 b5 bC bD]
- }
- {
- // vertical DCTs finished. Now we do the horizontal DCTs.
- // Stage 3: Add/subtract
-
- // t0 = [c0 c1 c8 c9 c4 c5 cC cD]
- // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
- const __m128i t0 = ADD_EPI16(in0, in1);
- const __m128i t1 = SUB_EPI16(in0, in1);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&t0, &t1);
- if (overflow) {
- aom_highbd_fdct4x4_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
-
- // Stage 4: multiply by constants (which gets us into 32 bits).
- {
- // The constants needed here are:
- // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
- // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
- // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
- // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
- const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
- const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
- // Then add and right-shift to get back to 16-bit range
- // but this combines the final right-shift as well to save operations
-      // This unusual rounding operation is to maintain bit-accurate
- // compatibility with the c version of this function which has two
- // rounding steps in a row.
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
- // w0 = [o0 o4 o8 oC]
- // w1 = [o2 o6 oA oE]
- // w2 = [o1 o5 o9 oD]
- // w3 = [o3 o7 oB oF]
- // remember the o's are numbered according to the correct output location
- const __m128i x0 = _mm_packs_epi32(w0, w1);
- const __m128i x1 = _mm_packs_epi32(w2, w3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&x0, &x1);
- if (overflow) {
- aom_highbd_fdct4x4_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- {
- // x0 = [o0 o4 o8 oC o2 o6 oA oE]
- // x1 = [o1 o5 o9 oD o3 o7 oB oF]
- const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
- const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
- // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
- // y1 = [o2 o3 o6 o7 oA oB oE oF]
- in0 = _mm_unpacklo_epi32(y0, y1);
- // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
- in1 = _mm_unpackhi_epi32(y0, y1);
- // in1 = [o8 o9 oA oB oC oD oE oF]
- }
- }
- }
- // Post-condition (v + 1) >> 2 is now incorporated into previous
-  // add and right-shift commands. Only 2 store instructions are needed
-  // because rows 1 and 3 are stored in memory just after rows 0 and 2.
- storeu_output(&in0, output + 0 * 4);
- storeu_output(&in1, output + 2 * 4);
-}
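The comments in FDCT4x4_2D above use the pN/mN shorthand for the fixed-point cosine constants. Assuming the usual libvpx/libaom convention that cospi_N_64 is cos(N*pi/64) in Q14 fixed point (the constants live in aom_dsp/txfm_common.h, which is not part of this hunk), the table behind that shorthand can be reproduced with the short illustrative program below.

#include <math.h>
#include <stdio.h>

/* Print the presumed values behind the cospi_N_64 constants referenced in
 * the pN / mN comments above: cos(N*pi/64) scaled by 2^14. */
int main(void) {
  const double pi = 3.14159265358979323846;
  for (int n = 1; n < 32; ++n) {
    const double v = cos(n * pi / 64.0) * (1 << 14);
    printf("cospi_%d_64 ~= %.0f\n", n, floor(v + 0.5));
  }
  return 0;
}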
-
-void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
- int pass;
- // Constants
- // When we use them, in one case, they are all the same. In all others
- // it's a pair of them that we need to repeat four times. This is done
- // by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-#if DCT_HIGH_BIT_DEPTH
- int overflow;
-#endif
- // Load input
- __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
- __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
- // Pre-condition input (shift by two)
- in0 = _mm_slli_epi16(in0, 2);
- in1 = _mm_slli_epi16(in1, 2);
- in2 = _mm_slli_epi16(in2, 2);
- in3 = _mm_slli_epi16(in3, 2);
- in4 = _mm_slli_epi16(in4, 2);
- in5 = _mm_slli_epi16(in5, 2);
- in6 = _mm_slli_epi16(in6, 2);
- in7 = _mm_slli_epi16(in7, 2);
-
- // We do two passes, first the columns, then the rows. The results of the
- // first pass are transposed so that the same column code can be reused. The
- // results of the second pass are also transposed so that the rows (processed
- // as columns) are put back in row positions.
- for (pass = 0; pass < 2; pass++) {
- // To store results of each pass before the transpose.
- __m128i res0, res1, res2, res3, res4, res5, res6, res7;
- // Add/subtract
- const __m128i q0 = ADD_EPI16(in0, in7);
- const __m128i q1 = ADD_EPI16(in1, in6);
- const __m128i q2 = ADD_EPI16(in2, in5);
- const __m128i q3 = ADD_EPI16(in3, in4);
- const __m128i q4 = SUB_EPI16(in3, in4);
- const __m128i q5 = SUB_EPI16(in2, in5);
- const __m128i q6 = SUB_EPI16(in1, in6);
- const __m128i q7 = SUB_EPI16(in0, in7);
-#if DCT_HIGH_BIT_DEPTH
- if (pass == 1) {
- overflow =
- check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Work on first four results
- {
- // Add/subtract
- const __m128i r0 = ADD_EPI16(q0, q3);
- const __m128i r1 = ADD_EPI16(q1, q2);
- const __m128i r2 = SUB_EPI16(q1, q2);
- const __m128i r3 = SUB_EPI16(q0, q3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Interleave to do the multiply by constants which gets us into 32bits
- {
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res0 = _mm_packs_epi32(w0, w1);
- res4 = _mm_packs_epi32(w2, w3);
- res2 = _mm_packs_epi32(w4, w5);
- res6 = _mm_packs_epi32(w6, w7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- // Work on next four results
- {
- // Interleave to do the multiply by constants which gets us into 32bits
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
- const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
- const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
- const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
- const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
- const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
- const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
- const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
- const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
- const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
- const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
- const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
- const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
- // Combine
- const __m128i r0 = _mm_packs_epi32(s0, s1);
- const __m128i r1 = _mm_packs_epi32(s2, s3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&r0, &r1);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- {
- // Add/subtract
- const __m128i x0 = ADD_EPI16(q4, r0);
- const __m128i x1 = SUB_EPI16(q4, r0);
- const __m128i x2 = SUB_EPI16(q7, r1);
- const __m128i x3 = ADD_EPI16(q7, r1);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Interleave to do the multiply by constants which gets us into 32bits
- {
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
- const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
- const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
- const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res1 = _mm_packs_epi32(w0, w1);
- res7 = _mm_packs_epi32(w2, w3);
- res5 = _mm_packs_epi32(w4, w5);
- res3 = _mm_packs_epi32(w6, w7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- }
- // Transpose the 8x8.
- {
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
- const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
- const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
- const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
- const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
- const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
- const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
-      // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
-      // 04 14 24 34 05 15 25 35
-      // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- }
- }
- // Post-condition output and store it
- {
- // Post-condition (division by two)
- // division of two 16 bits signed numbers using shifts
- // n / 2 = (n - (n >> 15)) >> 1
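    // For example: n =  5:  5 >> 15 =  0, ( 5 - 0) >> 1 =  2;
    //              n = -5: -5 >> 15 = -1, (-5 + 1) >> 1 = -2,
    // i.e. the result truncates toward zero, whereas a bare arithmetic
    // shift (-5 >> 1 == -3) would round toward minus infinity.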
- const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
- const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
- const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
- const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
- const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
- const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
- const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
- const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
- in0 = _mm_sub_epi16(in0, sign_in0);
- in1 = _mm_sub_epi16(in1, sign_in1);
- in2 = _mm_sub_epi16(in2, sign_in2);
- in3 = _mm_sub_epi16(in3, sign_in3);
- in4 = _mm_sub_epi16(in4, sign_in4);
- in5 = _mm_sub_epi16(in5, sign_in5);
- in6 = _mm_sub_epi16(in6, sign_in6);
- in7 = _mm_sub_epi16(in7, sign_in7);
- in0 = _mm_srai_epi16(in0, 1);
- in1 = _mm_srai_epi16(in1, 1);
- in2 = _mm_srai_epi16(in2, 1);
- in3 = _mm_srai_epi16(in3, 1);
- in4 = _mm_srai_epi16(in4, 1);
- in5 = _mm_srai_epi16(in5, 1);
- in6 = _mm_srai_epi16(in6, 1);
- in7 = _mm_srai_epi16(in7, 1);
- // store results
- store_output(&in0, (output + 0 * 8));
- store_output(&in1, (output + 1 * 8));
- store_output(&in2, (output + 2 * 8));
- store_output(&in3, (output + 3 * 8));
- store_output(&in4, (output + 4 * 8));
- store_output(&in5, (output + 5 * 8));
- store_output(&in6, (output + 6 * 8));
- store_output(&in7, (output + 7 * 8));
- }
-}
-
-void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
- // The 2D transform is done with two passes which are actually pretty
- // similar. In the first one, we transform the columns and transpose
- // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we transpose the columns (that
- // is the transposed rows) and transpose the results (so that it goes back
- // in normal/row positions).
- int pass;
- // We need an intermediate buffer between passes.
- DECLARE_ALIGNED(16, int16_t, intermediate[256]);
- const int16_t *in = input;
- int16_t *out0 = intermediate;
- tran_low_t *out1 = output;
- // Constants
- // When we use them, in one case, they are all the same. In all others
- // it's a pair of them that we need to repeat four times. This is done
- // by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
- const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
- const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
- const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
- const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
- const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
- const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kOne = _mm_set1_epi16(1);
- // Do the two transform/transpose passes
- for (pass = 0; pass < 2; ++pass) {
- // We process eight columns (transposed rows in second pass) at a time.
- int column_start;
-#if DCT_HIGH_BIT_DEPTH
- int overflow;
-#endif
- for (column_start = 0; column_start < 16; column_start += 8) {
- __m128i in00, in01, in02, in03, in04, in05, in06, in07;
- __m128i in08, in09, in10, in11, in12, in13, in14, in15;
- __m128i input0, input1, input2, input3, input4, input5, input6, input7;
- __m128i step1_0, step1_1, step1_2, step1_3;
- __m128i step1_4, step1_5, step1_6, step1_7;
- __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
- __m128i step3_0, step3_1, step3_2, step3_3;
- __m128i step3_4, step3_5, step3_6, step3_7;
- __m128i res00, res01, res02, res03, res04, res05, res06, res07;
- __m128i res08, res09, res10, res11, res12, res13, res14, res15;
- // Load and pre-condition input.
- if (0 == pass) {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
- // x = x << 2
- in00 = _mm_slli_epi16(in00, 2);
- in01 = _mm_slli_epi16(in01, 2);
- in02 = _mm_slli_epi16(in02, 2);
- in03 = _mm_slli_epi16(in03, 2);
- in04 = _mm_slli_epi16(in04, 2);
- in05 = _mm_slli_epi16(in05, 2);
- in06 = _mm_slli_epi16(in06, 2);
- in07 = _mm_slli_epi16(in07, 2);
- in08 = _mm_slli_epi16(in08, 2);
- in09 = _mm_slli_epi16(in09, 2);
- in10 = _mm_slli_epi16(in10, 2);
- in11 = _mm_slli_epi16(in11, 2);
- in12 = _mm_slli_epi16(in12, 2);
- in13 = _mm_slli_epi16(in13, 2);
- in14 = _mm_slli_epi16(in14, 2);
- in15 = _mm_slli_epi16(in15, 2);
- } else {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
- // x = (x + 1) >> 2
- in00 = _mm_add_epi16(in00, kOne);
- in01 = _mm_add_epi16(in01, kOne);
- in02 = _mm_add_epi16(in02, kOne);
- in03 = _mm_add_epi16(in03, kOne);
- in04 = _mm_add_epi16(in04, kOne);
- in05 = _mm_add_epi16(in05, kOne);
- in06 = _mm_add_epi16(in06, kOne);
- in07 = _mm_add_epi16(in07, kOne);
- in08 = _mm_add_epi16(in08, kOne);
- in09 = _mm_add_epi16(in09, kOne);
- in10 = _mm_add_epi16(in10, kOne);
- in11 = _mm_add_epi16(in11, kOne);
- in12 = _mm_add_epi16(in12, kOne);
- in13 = _mm_add_epi16(in13, kOne);
- in14 = _mm_add_epi16(in14, kOne);
- in15 = _mm_add_epi16(in15, kOne);
- in00 = _mm_srai_epi16(in00, 2);
- in01 = _mm_srai_epi16(in01, 2);
- in02 = _mm_srai_epi16(in02, 2);
- in03 = _mm_srai_epi16(in03, 2);
- in04 = _mm_srai_epi16(in04, 2);
- in05 = _mm_srai_epi16(in05, 2);
- in06 = _mm_srai_epi16(in06, 2);
- in07 = _mm_srai_epi16(in07, 2);
- in08 = _mm_srai_epi16(in08, 2);
- in09 = _mm_srai_epi16(in09, 2);
- in10 = _mm_srai_epi16(in10, 2);
- in11 = _mm_srai_epi16(in11, 2);
- in12 = _mm_srai_epi16(in12, 2);
- in13 = _mm_srai_epi16(in13, 2);
- in14 = _mm_srai_epi16(in14, 2);
- in15 = _mm_srai_epi16(in15, 2);
- }
- in += 8;
- // Calculate input for the first 8 results.
- {
- input0 = ADD_EPI16(in00, in15);
- input1 = ADD_EPI16(in01, in14);
- input2 = ADD_EPI16(in02, in13);
- input3 = ADD_EPI16(in03, in12);
- input4 = ADD_EPI16(in04, in11);
- input5 = ADD_EPI16(in05, in10);
- input6 = ADD_EPI16(in06, in09);
- input7 = ADD_EPI16(in07, in08);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
- &input4, &input5, &input6, &input7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Calculate input for the next 8 results.
- {
- step1_0 = SUB_EPI16(in07, in08);
- step1_1 = SUB_EPI16(in06, in09);
- step1_2 = SUB_EPI16(in05, in10);
- step1_3 = SUB_EPI16(in04, in11);
- step1_4 = SUB_EPI16(in03, in12);
- step1_5 = SUB_EPI16(in02, in13);
- step1_6 = SUB_EPI16(in01, in14);
- step1_7 = SUB_EPI16(in00, in15);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
- &step1_4, &step1_5, &step1_6, &step1_7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Work on the first eight values; fdct8(input, even_results);
- {
- // Add/subtract
- const __m128i q0 = ADD_EPI16(input0, input7);
- const __m128i q1 = ADD_EPI16(input1, input6);
- const __m128i q2 = ADD_EPI16(input2, input5);
- const __m128i q3 = ADD_EPI16(input3, input4);
- const __m128i q4 = SUB_EPI16(input3, input4);
- const __m128i q5 = SUB_EPI16(input2, input5);
- const __m128i q6 = SUB_EPI16(input1, input6);
- const __m128i q7 = SUB_EPI16(input0, input7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Work on first four results
- {
- // Add/subtract
- const __m128i r0 = ADD_EPI16(q0, q3);
- const __m128i r1 = ADD_EPI16(q1, q2);
- const __m128i r2 = SUB_EPI16(q1, q2);
- const __m128i r3 = SUB_EPI16(q0, q3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- {
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
- res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- // Work on next four results
- {
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
- const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
- const __m128i r0 =
- mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- const __m128i r1 =
- mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&r0, &r1);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- {
- // Add/subtract
- const __m128i x0 = ADD_EPI16(q4, r0);
- const __m128i x1 = SUB_EPI16(q4, r0);
- const __m128i x2 = SUB_EPI16(q7, r1);
- const __m128i x3 = ADD_EPI16(q7, r1);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- {
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
- const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
- const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
- const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
- res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&res02, &res14, &res10, &res06);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- }
- }
- // Work on the next eight values; step1 -> odd_results
- {
- // step 2
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
- const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
- const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
- const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
- step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // step 3
- {
- step3_0 = ADD_EPI16(step1_0, step2_3);
- step3_1 = ADD_EPI16(step1_1, step2_2);
- step3_2 = SUB_EPI16(step1_1, step2_2);
- step3_3 = SUB_EPI16(step1_0, step2_3);
- step3_4 = SUB_EPI16(step1_7, step2_4);
- step3_5 = SUB_EPI16(step1_6, step2_5);
- step3_6 = ADD_EPI16(step1_6, step2_5);
- step3_7 = ADD_EPI16(step1_7, step2_4);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3,
- &step3_4, &step3_5, &step3_6, &step3_7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // step 4
- {
- const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
- const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
- const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
- const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
- step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // step 5
- {
- step1_0 = ADD_EPI16(step3_0, step2_1);
- step1_1 = SUB_EPI16(step3_0, step2_1);
- step1_2 = ADD_EPI16(step3_3, step2_2);
- step1_3 = SUB_EPI16(step3_3, step2_2);
- step1_4 = SUB_EPI16(step3_4, step2_5);
- step1_5 = ADD_EPI16(step3_4, step2_5);
- step1_6 = SUB_EPI16(step3_7, step2_6);
- step1_7 = ADD_EPI16(step3_7, step2_6);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
- &step1_4, &step1_5, &step1_6, &step1_7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // step 6
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
- const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
- const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
- const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
- res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
- const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
- const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
- const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
- res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- // Transpose the results, do it as two 8x8 transposes.
- transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05,
- &res06, &res07, pass, out0, out1);
- transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13,
- &res14, &res15, pass, out0 + 8, out1 + 8);
- if (pass == 0) {
- out0 += 8 * 16;
- } else {
- out1 += 8 * 16;
- }
- }
- // Setup in/out for next pass.
- in = intermediate;
- }
-}
-
-#undef ADD_EPI16
-#undef SUB_EPI16
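Throughout the file deleted above, coefficient multiplies follow one pattern: interleave two 16-bit rows, run _mm_madd_epi16 against a packed cosine pair, then apply the dct_const_round_shift (add DCT_CONST_ROUNDING, arithmetic-shift right by DCT_CONST_BITS) and pack back to 16 bits. A scalar model of one output lane is sketched below; the constant values are assumed to follow the usual txfm_common.h definitions (DCT_CONST_BITS == 14) rather than being taken from this patch.

#include <stdint.h>

/* Assumed to match aom_dsp/txfm_common.h. */
#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

/* Scalar model of one lane of the madd + dct_const_round_shift pattern:
 * _mm_madd_epi16 forms a*wa + b*wb in 32 bits, the rounding constant is
 * added, and the sum is shifted back into 16-bit coefficient range. */
static int16_t mult_round_shift_scalar(int16_t a, int16_t b, int16_t wa,
                                       int16_t wb) {
  const int32_t sum = (int32_t)a * wa + (int32_t)b * wb;
  return (int16_t)((sum + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}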
diff --git a/av1/common/x86/av1_fwd_txfm_sse2.c b/av1/common/x86/av1_fwd_txfm_sse2.c
deleted file mode 100644
index 081fe08..0000000
--- a/av1/common/x86/av1_fwd_txfm_sse2.c
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "./aom_config.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/x86/fwd_txfm_sse2.h"
-
-void av1_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
- __m128i in0, in1;
- __m128i tmp;
- const __m128i zero = _mm_setzero_si128();
- in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in1 = _mm_unpacklo_epi64(
- in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
- in0 = _mm_unpacklo_epi64(
- in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
-
- tmp = _mm_add_epi16(in0, in1);
- in0 = _mm_unpacklo_epi16(zero, tmp);
- in1 = _mm_unpackhi_epi16(zero, tmp);
- in0 = _mm_srai_epi32(in0, 16);
- in1 = _mm_srai_epi32(in1, 16);
-
- tmp = _mm_add_epi32(in0, in1);
- in0 = _mm_unpacklo_epi32(tmp, zero);
- in1 = _mm_unpackhi_epi32(tmp, zero);
-
- tmp = _mm_add_epi32(in0, in1);
- in0 = _mm_srli_si128(tmp, 8);
-
- in1 = _mm_add_epi32(tmp, in0);
- in0 = _mm_slli_epi32(in1, 1);
- store_output(&in0, output);
-}
-
-void av1_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
- __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
- __m128i u0, u1, sum;
-
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
-
- in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
- sum = _mm_add_epi16(u0, u1);
-
- in0 = _mm_add_epi16(in0, in1);
- in2 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, in0);
-
- u0 = _mm_setzero_si128();
- sum = _mm_add_epi16(sum, in2);
-
- in0 = _mm_unpacklo_epi16(u0, sum);
- in1 = _mm_unpackhi_epi16(u0, sum);
- in0 = _mm_srai_epi32(in0, 16);
- in1 = _mm_srai_epi32(in1, 16);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_unpacklo_epi32(sum, u0);
- in1 = _mm_unpackhi_epi32(sum, u0);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_srli_si128(sum, 8);
-
- in1 = _mm_add_epi32(sum, in0);
- store_output(&in1, output);
-}
-
-void av1_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
- int stride) {
- __m128i in0, in1, in2, in3;
- __m128i u0, u1;
- __m128i sum = _mm_setzero_si128();
- int i;
-
- for (i = 0; i < 2; ++i) {
- input += 8 * i;
- in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
-
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));
-
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));
-
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- sum = _mm_add_epi16(sum, u1);
- }
-
- u0 = _mm_setzero_si128();
- in0 = _mm_unpacklo_epi16(u0, sum);
- in1 = _mm_unpackhi_epi16(u0, sum);
- in0 = _mm_srai_epi32(in0, 16);
- in1 = _mm_srai_epi32(in1, 16);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_unpacklo_epi32(sum, u0);
- in1 = _mm_unpackhi_epi32(sum, u0);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_srli_si128(sum, 8);
-
- in1 = _mm_add_epi32(sum, in0);
- in1 = _mm_srai_epi32(in1, 1);
- store_output(&in1, output);
-}
-
-void av1_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
- int stride) {
- __m128i in0, in1, in2, in3;
- __m128i u0, u1;
- __m128i sum = _mm_setzero_si128();
- int i;
-
- for (i = 0; i < 8; ++i) {
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
-
- input += stride;
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
-
- input += stride;
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
-
- input += stride;
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- in0 = _mm_load_si128((const __m128i *)(input + 0));
- in1 = _mm_load_si128((const __m128i *)(input + 8));
- in2 = _mm_load_si128((const __m128i *)(input + 16));
- in3 = _mm_load_si128((const __m128i *)(input + 24));
-
- input += stride;
- sum = _mm_add_epi16(sum, u1);
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, u0);
-
- sum = _mm_add_epi16(sum, u1);
- }
-
- u0 = _mm_setzero_si128();
- in0 = _mm_unpacklo_epi16(u0, sum);
- in1 = _mm_unpackhi_epi16(u0, sum);
- in0 = _mm_srai_epi32(in0, 16);
- in1 = _mm_srai_epi32(in1, 16);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_unpacklo_epi32(sum, u0);
- in1 = _mm_unpackhi_epi32(sum, u0);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_srli_si128(sum, 8);
-
- in1 = _mm_add_epi32(sum, in0);
- in1 = _mm_srai_epi32(in1, 3);
- store_output(&in1, output);
-}
-
-#define DCT_HIGH_BIT_DEPTH 0
-#define FDCT4x4_2D av1_fdct4x4_sse2
-#define FDCT8x8_2D av1_fdct8x8_sse2
-#define FDCT16x16_2D av1_fdct16x16_sse2
-#include "av1/common/x86/av1_fwd_txfm_impl_sse2.h"
-#undef FDCT4x4_2D
-#undef FDCT8x8_2D
-#undef FDCT16x16_2D
-
-#define FDCT32x32_2D av1_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h"
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D av1_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h" // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-#undef DCT_HIGH_BIT_DEPTH
-
-#if CONFIG_AOM_HIGHBITDEPTH
-#define DCT_HIGH_BIT_DEPTH 1
-#define FDCT4x4_2D av1_highbd_fdct4x4_sse2
-#define FDCT8x8_2D av1_highbd_fdct8x8_sse2
-#define FDCT16x16_2D av1_highbd_fdct16x16_sse2
-#include "av1/common/x86/av1_fwd_txfm_impl_sse2.h" // NOLINT
-#undef FDCT4x4_2D
-#undef FDCT8x8_2D
-#undef FDCT16x16_2D
-
-#define FDCT32x32_2D av1_highbd_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h" // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D av1_highbd_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "av1/common/x86/av1_fwd_dct32x32_impl_sse2.h" // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-#undef DCT_HIGH_BIT_DEPTH
-#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/x86/av1_highbd_convolve_filters_sse4.h b/av1/common/x86/av1_highbd_convolve_filters_sse4.h
index 1fb5023..be85657 100644
--- a/av1/common/x86/av1_highbd_convolve_filters_sse4.h
+++ b/av1/common/x86/av1_highbd_convolve_filters_sse4.h
@@ -8,315 +8,135 @@
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+
#ifndef AV1_COMMON_X86_AV1_HIGHBD_CONVOLVE_FILTERS_SSE4_H_
#define AV1_COMMON_X86_AV1_HIGHBD_CONVOLVE_FILTERS_SSE4_H_
#include "./aom_config.h"
-#include "av1/common/filter.h"
#if CONFIG_AOM_HIGHBITDEPTH
#if CONFIG_EXT_INTERP
DECLARE_ALIGNED(16, static const int16_t,
sub_pel_filters_10sharp_highbd_ver_signal_dir[15][6][8]) = {
{
- {
- 0, 0, 0, 0, 0, 0, 0, 0,
- },
- {
- -1, 3, -1, 3, -1, 3, -1, 3,
- },
- {
- -6, 127, -6, 127, -6, 127, -6, 127,
- },
- {
- 8, -4, 8, -4, 8, -4, 8, -4,
- },
- {
- 2, -1, 2, -1, 2, -1, 2, -1,
- },
- {
- 0, 0, 0, 0, 0, 0, 0, 0,
- },
+ { 0, 0, 0, 0, 0, 0, 0, 0 },
+ { -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -6, 127, -6, 127, -6, 127, -6, 127 },
+ { 8, -4, 8, -4, 8, -4, 8, -4 },
+ { 2, -1, 2, -1, 2, -1, 2, -1 },
+ { 0, 0, 0, 0, 0, 0, 0, 0 },
},
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -2, 5, -2, 5, -2, 5, -2, 5,
- },
- {
- -12, 124, -12, 124, -12, 124, -12, 124,
- },
- {
- 18, -7, 18, -7, 18, -7, 18, -7,
- },
- {
- 3, -2, 3, -2, 3, -2, 3, -2,
- },
- {
- 0, 0, 0, 0, 0, 0, 0, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -2, 5, -2, 5, -2, 5, -2, 5 },
+ { -12, 124, -12, 124, -12, 124, -12, 124 },
+ { 18, -7, 18, -7, 18, -7, 18, -7 },
+ { 3, -2, 3, -2, 3, -2, 3, -2 },
+ { 0, 0, 0, 0, 0, 0, 0, 0 },
},
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -3, 7, -3, 7, -3, 7, -3, 7,
- },
- {
- -17, 119, -17, 119, -17, 119, -17, 119,
- },
- {
- 28, -11, 28, -11, 28, -11, 28, -11,
- },
- {
- 5, -2, 5, -2, 5, -2, 5, -2,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -3, 7, -3, 7, -3, 7, -3, 7 },
+ { -17, 119, -17, 119, -17, 119, -17, 119 },
+ { 28, -11, 28, -11, 28, -11, 28, -11 },
+ { 5, -2, 5, -2, 5, -2, 5, -2 },
+ { 1, 0, 1, 0, 1, 0, 1, 0 },
},
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -4, 8, -4, 8, -4, 8, -4, 8,
- },
- {
- -20, 114, -20, 114, -20, 114, -20, 114,
- },
- {
- 38, -14, 38, -14, 38, -14, 38, -14,
- },
- {
- 7, -3, 7, -3, 7, -3, 7, -3,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -4, 8, -4, 8, -4, 8, -4, 8 },
+ { -20, 114, -20, 114, -20, 114, -20, 114 },
+ { 38, -14, 38, -14, 38, -14, 38, -14 },
+ { 7, -3, 7, -3, 7, -3, 7, -3 },
+ { 1, 0, 1, 0, 1, 0, 1, 0 },
},
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -4, 9, -4, 9, -4, 9, -4, 9,
- },
- {
- -22, 107, -22, 107, -22, 107, -22, 107,
- },
- {
- 49, -17, 49, -17, 49, -17, 49, -17,
- },
- {
- 8, -4, 8, -4, 8, -4, 8, -4,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -4, 9, -4, 9, -4, 9, -4, 9 },
+ { -22, 107, -22, 107, -22, 107, -22, 107 },
+ { 49, -17, 49, -17, 49, -17, 49, -17 },
+ { 8, -4, 8, -4, 8, -4, 8, -4 },
+ { 1, 0, 1, 0, 1, 0, 1, 0 },
},
{
- {
- 0, 2, 0, 2, 0, 2, 0, 2,
- },
- {
- -5, 10, -5, 10, -5, 10, -5, 10,
- },
- {
- -24, 99, -24, 99, -24, 99, -24, 99,
- },
- {
- 59, -20, 59, -20, 59, -20, 59, -20,
- },
- {
- 9, -4, 9, -4, 9, -4, 9, -4,
- },
- {
- 2, 0, 2, 0, 2, 0, 2, 0,
- },
+ { 0, 2, 0, 2, 0, 2, 0, 2 },
+ { -5, 10, -5, 10, -5, 10, -5, 10 },
+ { -24, 99, -24, 99, -24, 99, -24, 99 },
+ { 59, -20, 59, -20, 59, -20, 59, -20 },
+ { 9, -4, 9, -4, 9, -4, 9, -4 },
+ { 2, 0, 2, 0, 2, 0, 2, 0 },
},
{
- {
- 0, 2, 0, 2, 0, 2, 0, 2,
- },
- {
- -5, 10, -5, 10, -5, 10, -5, 10,
- },
- {
- -24, 90, -24, 90, -24, 90, -24, 90,
- },
- {
- 70, -22, 70, -22, 70, -22, 70, -22,
- },
- {
- 10, -5, 10, -5, 10, -5, 10, -5,
- },
- {
- 2, 0, 2, 0, 2, 0, 2, 0,
- },
+ { 0, 2, 0, 2, 0, 2, 0, 2 },
+ { -5, 10, -5, 10, -5, 10, -5, 10 },
+ { -24, 90, -24, 90, -24, 90, -24, 90 },
+ { 70, -22, 70, -22, 70, -22, 70, -22 },
+ { 10, -5, 10, -5, 10, -5, 10, -5 },
+ { 2, 0, 2, 0, 2, 0, 2, 0 },
},
{
- {
- 0, 2, 0, 2, 0, 2, 0, 2,
- },
- {
- -5, 10, -5, 10, -5, 10, -5, 10,
- },
- {
- -23, 80, -23, 80, -23, 80, -23, 80,
- },
- {
- 80, -23, 80, -23, 80, -23, 80, -23,
- },
- {
- 10, -5, 10, -5, 10, -5, 10, -5,
- },
- {
- 2, 0, 2, 0, 2, 0, 2, 0,
- },
+ { 0, 2, 0, 2, 0, 2, 0, 2 },
+ { -5, 10, -5, 10, -5, 10, -5, 10 },
+ { -23, 80, -23, 80, -23, 80, -23, 80 },
+ { 80, -23, 80, -23, 80, -23, 80, -23 },
+ { 10, -5, 10, -5, 10, -5, 10, -5 },
+ { 2, 0, 2, 0, 2, 0, 2, 0 },
},
{
- {
- 0, 2, 0, 2, 0, 2, 0, 2,
- },
- {
- -5, 10, -5, 10, -5, 10, -5, 10,
- },
- {
- -22, 70, -22, 70, -22, 70, -22, 70,
- },
- {
- 90, -24, 90, -24, 90, -24, 90, -24,
- },
- {
- 10, -5, 10, -5, 10, -5, 10, -5,
- },
- {
- 2, 0, 2, 0, 2, 0, 2, 0,
- },
+ { 0, 2, 0, 2, 0, 2, 0, 2 },
+ { -5, 10, -5, 10, -5, 10, -5, 10 },
+ { -22, 70, -22, 70, -22, 70, -22, 70 },
+ { 90, -24, 90, -24, 90, -24, 90, -24 },
+ { 10, -5, 10, -5, 10, -5, 10, -5 },
+ { 2, 0, 2, 0, 2, 0, 2, 0 },
},
{
- {
- 0, 2, 0, 2, 0, 2, 0, 2,
- },
- {
- -4, 9, -4, 9, -4, 9, -4, 9,
- },
- {
- -20, 59, -20, 59, -20, 59, -20, 59,
- },
- {
- 99, -24, 99, -24, 99, -24, 99, -24,
- },
- {
- 10, -5, 10, -5, 10, -5, 10, -5,
- },
- {
- 2, 0, 2, 0, 2, 0, 2, 0,
- },
+ { 0, 2, 0, 2, 0, 2, 0, 2 },
+ { -4, 9, -4, 9, -4, 9, -4, 9 },
+ { -20, 59, -20, 59, -20, 59, -20, 59 },
+ { 99, -24, 99, -24, 99, -24, 99, -24 },
+ { 10, -5, 10, -5, 10, -5, 10, -5 },
+ { 2, 0, 2, 0, 2, 0, 2, 0 },
},
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -4, 8, -4, 8, -4, 8, -4, 8,
- },
- {
- -17, 49, -17, 49, -17, 49, -17, 49,
- },
- {
- 107, -22, 107, -22, 107, -22, 107, -22,
- },
- {
- 9, -4, 9, -4, 9, -4, 9, -4,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -4, 8, -4, 8, -4, 8, -4, 8 },
+ { -17, 49, -17, 49, -17, 49, -17, 49 },
+ { 107, -22, 107, -22, 107, -22, 107, -22 },
+ { 9, -4, 9, -4, 9, -4, 9, -4 },
+ { 1, 0, 1, 0, 1, 0, 1, 0 },
},
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -3, 7, -3, 7, -3, 7, -3, 7,
- },
- {
- -14, 38, -14, 38, -14, 38, -14, 38,
- },
- {
- 114, -20, 114, -20, 114, -20, 114, -20,
- },
- {
- 8, -4, 8, -4, 8, -4, 8, -4,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -3, 7, -3, 7, -3, 7, -3, 7 },
+ { -14, 38, -14, 38, -14, 38, -14, 38 },
+ { 114, -20, 114, -20, 114, -20, 114, -20 },
+ { 8, -4, 8, -4, 8, -4, 8, -4 },
+ { 1, 0, 1, 0, 1, 0, 1, 0 },
},
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -2, 5, -2, 5, -2, 5, -2, 5,
- },
- {
- -11, 28, -11, 28, -11, 28, -11, 28,
- },
- {
- 119, -17, 119, -17, 119, -17, 119, -17,
- },
- {
- 7, -3, 7, -3, 7, -3, 7, -3,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -2, 5, -2, 5, -2, 5, -2, 5 },
+ { -11, 28, -11, 28, -11, 28, -11, 28 },
+ { 119, -17, 119, -17, 119, -17, 119, -17 },
+ { 7, -3, 7, -3, 7, -3, 7, -3 },
+ { 1, 0, 1, 0, 1, 0, 1, 0 },
},
{
- {
- 0, 0, 0, 0, 0, 0, 0, 0,
- },
- {
- -2, 3, -2, 3, -2, 3, -2, 3,
- },
- {
- -7, 18, -7, 18, -7, 18, -7, 18,
- },
- {
- 124, -12, 124, -12, 124, -12, 124, -12,
- },
- {
- 5, -2, 5, -2, 5, -2, 5, -2,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 0, 0, 0, 0, 0, 0, 0 },
+ { -2, 3, -2, 3, -2, 3, -2, 3 },
+ { -7, 18, -7, 18, -7, 18, -7, 18 },
+ { 124, -12, 124, -12, 124, -12, 124, -12 },
+ { 5, -2, 5, -2, 5, -2, 5, -2 },
+ { 1, 0, 1, 0, 1, 0, 1, 0 },
},
{
- {
- 0, 0, 0, 0, 0, 0, 0, 0,
- },
- {
- -1, 2, -1, 2, -1, 2, -1, 2,
- },
- {
- -4, 8, -4, 8, -4, 8, -4, 8,
- },
- {
- 127, -6, 127, -6, 127, -6, 127, -6,
- },
- {
- 3, -1, 3, -1, 3, -1, 3, -1,
- },
- {
- 0, 0, 0, 0, 0, 0, 0, 0,
- },
+ { 0, 0, 0, 0, 0, 0, 0, 0 },
+ { -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -4, 8, -4, 8, -4, 8, -4, 8 },
+ { 127, -6, 127, -6, 127, -6, 127, -6 },
+ { 3, -1, 3, -1, 3, -1, 3, -1 },
+ { 0, 0, 0, 0, 0, 0, 0, 0 },
},
};
#endif
@@ -326,304 +146,252 @@
DECLARE_ALIGNED(16, static const int16_t,
sub_pel_filters_12sharp_highbd_ver_signal_dir[15][6][8]) = {
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -2, 3, -2, 3, -2, 3, -2, 3,
- },
- {
- -7, 127, -7, 127, -7, 127, -7, 127,
- },
- {
- 8, -4, 8, -4, 8, -4, 8, -4,
- },
- {
- 2, -1, 2, -1, 2, -1, 2, -1,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -2, 3, -2, 3, -2, 3, -2, 3 },
+ { -7, 127, -7, 127, -7, 127, -7, 127 },
+ { 8, -4, 8, -4, 8, -4, 8, -4 },
+ { 2, -1, 2, -1, 2, -1, 2, -1 },
+ { 1, 0, 1, 0, 1, 0, 1, 0 },
},
{
- {
- -1, 2, -1, 2, -1, 2, -1, 2,
- },
- {
- -3, 6, -3, 6, -3, 6, -3, 6,
- },
- {
- -13, 124, -13, 124, -13, 124, -13, 124,
- },
- {
- 18, -8, 18, -8, 18, -8, 18, -8,
- },
- {
- 4, -2, 4, -2, 4, -2, 4, -2,
- },
- {
- 2, -1, 2, -1, 2, -1, 2, -1,
- },
+ { -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -3, 6, -3, 6, -3, 6, -3, 6 },
+ { -13, 124, -13, 124, -13, 124, -13, 124 },
+ { 18, -8, 18, -8, 18, -8, 18, -8 },
+ { 4, -2, 4, -2, 4, -2, 4, -2 },
+ { 2, -1, 2, -1, 2, -1, 2, -1 },
},
{
- {
- -1, 3, -1, 3, -1, 3, -1, 3,
- },
- {
- -4, 8, -4, 8, -4, 8, -4, 8,
- },
- {
- -18, 120, -18, 120, -18, 120, -18, 120,
- },
- {
- 28, -12, 28, -12, 28, -12, 28, -12,
- },
- {
- 7, -4, 7, -4, 7, -4, 7, -4,
- },
- {
- 2, -1, 2, -1, 2, -1, 2, -1,
- },
+ { -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -4, 8, -4, 8, -4, 8, -4, 8 },
+ { -18, 120, -18, 120, -18, 120, -18, 120 },
+ { 28, -12, 28, -12, 28, -12, 28, -12 },
+ { 7, -4, 7, -4, 7, -4, 7, -4 },
+ { 2, -1, 2, -1, 2, -1, 2, -1 },
},
{
- {
- -1, 3, -1, 3, -1, 3, -1, 3,
- },
- {
- -6, 10, -6, 10, -6, 10, -6, 10,
- },
- {
- -21, 115, -21, 115, -21, 115, -21, 115,
- },
- {
- 38, -15, 38, -15, 38, -15, 38, -15,
- },
- {
- 8, -5, 8, -5, 8, -5, 8, -5,
- },
- {
- 3, -1, 3, -1, 3, -1, 3, -1,
- },
+ { -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -6, 10, -6, 10, -6, 10, -6, 10 },
+ { -21, 115, -21, 115, -21, 115, -21, 115 },
+ { 38, -15, 38, -15, 38, -15, 38, -15 },
+ { 8, -5, 8, -5, 8, -5, 8, -5 },
+ { 3, -1, 3, -1, 3, -1, 3, -1 },
},
{
- {
- -2, 4, -2, 4, -2, 4, -2, 4,
- },
- {
- -6, 12, -6, 12, -6, 12, -6, 12,
- },
- {
- -24, 108, -24, 108, -24, 108, -24, 108,
- },
- {
- 49, -18, 49, -18, 49, -18, 49, -18,
- },
- {
- 10, -6, 10, -6, 10, -6, 10, -6,
- },
- {
- 3, -2, 3, -2, 3, -2, 3, -2,
- },
+ { -2, 4, -2, 4, -2, 4, -2, 4 },
+ { -6, 12, -6, 12, -6, 12, -6, 12 },
+ { -24, 108, -24, 108, -24, 108, -24, 108 },
+ { 49, -18, 49, -18, 49, -18, 49, -18 },
+ { 10, -6, 10, -6, 10, -6, 10, -6 },
+ { 3, -2, 3, -2, 3, -2, 3, -2 },
},
{
- {
- -2, 4, -2, 4, -2, 4, -2, 4,
- },
- {
- -7, 13, -7, 13, -7, 13, -7, 13,
- },
- {
- -25, 100, -25, 100, -25, 100, -25, 100,
- },
- {
- 60, -21, 60, -21, 60, -21, 60, -21,
- },
- {
- 11, -7, 11, -7, 11, -7, 11, -7,
- },
- {
- 4, -2, 4, -2, 4, -2, 4, -2,
- },
+ { -2, 4, -2, 4, -2, 4, -2, 4 },
+ { -7, 13, -7, 13, -7, 13, -7, 13 },
+ { -25, 100, -25, 100, -25, 100, -25, 100 },
+ { 60, -21, 60, -21, 60, -21, 60, -21 },
+ { 11, -7, 11, -7, 11, -7, 11, -7 },
+ { 4, -2, 4, -2, 4, -2, 4, -2 },
},
{
- {
- -2, 4, -2, 4, -2, 4, -2, 4,
- },
- {
- -7, 13, -7, 13, -7, 13, -7, 13,
- },
- {
- -26, 91, -26, 91, -26, 91, -26, 91,
- },
- {
- 71, -24, 71, -24, 71, -24, 71, -24,
- },
- {
- 13, -7, 13, -7, 13, -7, 13, -7,
- },
- {
- 4, -2, 4, -2, 4, -2, 4, -2,
- },
+ { -2, 4, -2, 4, -2, 4, -2, 4 },
+ { -7, 13, -7, 13, -7, 13, -7, 13 },
+ { -26, 91, -26, 91, -26, 91, -26, 91 },
+ { 71, -24, 71, -24, 71, -24, 71, -24 },
+ { 13, -7, 13, -7, 13, -7, 13, -7 },
+ { 4, -2, 4, -2, 4, -2, 4, -2 },
},
{
- {
- -2, 4, -2, 4, -2, 4, -2, 4,
- },
- {
- -7, 13, -7, 13, -7, 13, -7, 13,
- },
- {
- -25, 81, -25, 81, -25, 81, -25, 81,
- },
- {
- 81, -25, 81, -25, 81, -25, 81, -25,
- },
- {
- 13, -7, 13, -7, 13, -7, 13, -7,
- },
- {
- 4, -2, 4, -2, 4, -2, 4, -2,
- },
+ { -2, 4, -2, 4, -2, 4, -2, 4 },
+ { -7, 13, -7, 13, -7, 13, -7, 13 },
+ { -25, 81, -25, 81, -25, 81, -25, 81 },
+ { 81, -25, 81, -25, 81, -25, 81, -25 },
+ { 13, -7, 13, -7, 13, -7, 13, -7 },
+ { 4, -2, 4, -2, 4, -2, 4, -2 },
},
{
- {
- -2, 4, -2, 4, -2, 4, -2, 4,
- },
- {
- -7, 13, -7, 13, -7, 13, -7, 13,
- },
- {
- -24, 71, -24, 71, -24, 71, -24, 71,
- },
- {
- 91, -26, 91, -26, 91, -26, 91, -26,
- },
- {
- 13, -7, 13, -7, 13, -7, 13, -7,
- },
- {
- 4, -2, 4, -2, 4, -2, 4, -2,
- },
+ { -2, 4, -2, 4, -2, 4, -2, 4 },
+ { -7, 13, -7, 13, -7, 13, -7, 13 },
+ { -24, 71, -24, 71, -24, 71, -24, 71 },
+ { 91, -26, 91, -26, 91, -26, 91, -26 },
+ { 13, -7, 13, -7, 13, -7, 13, -7 },
+ { 4, -2, 4, -2, 4, -2, 4, -2 },
},
{
- {
- -2, 4, -2, 4, -2, 4, -2, 4,
- },
- {
- -7, 11, -7, 11, -7, 11, -7, 11,
- },
- {
- -21, 60, -21, 60, -21, 60, -21, 60,
- },
- {
- 100, -25, 100, -25, 100, -25, 100, -25,
- },
- {
- 13, -7, 13, -7, 13, -7, 13, -7,
- },
- {
- 4, -2, 4, -2, 4, -2, 4, -2,
- },
+ { -2, 4, -2, 4, -2, 4, -2, 4 },
+ { -7, 11, -7, 11, -7, 11, -7, 11 },
+ { -21, 60, -21, 60, -21, 60, -21, 60 },
+ { 100, -25, 100, -25, 100, -25, 100, -25 },
+ { 13, -7, 13, -7, 13, -7, 13, -7 },
+ { 4, -2, 4, -2, 4, -2, 4, -2 },
},
{
- {
- -2, 3, -2, 3, -2, 3, -2, 3,
- },
- {
- -6, 10, -6, 10, -6, 10, -6, 10,
- },
- {
- -18, 49, -18, 49, -18, 49, -18, 49,
- },
- {
- 108, -24, 108, -24, 108, -24, 108, -24,
- },
- {
- 12, -6, 12, -6, 12, -6, 12, -6,
- },
- {
- 4, -2, 4, -2, 4, -2, 4, -2,
- },
+ { -2, 3, -2, 3, -2, 3, -2, 3 },
+ { -6, 10, -6, 10, -6, 10, -6, 10 },
+ { -18, 49, -18, 49, -18, 49, -18, 49 },
+ { 108, -24, 108, -24, 108, -24, 108, -24 },
+ { 12, -6, 12, -6, 12, -6, 12, -6 },
+ { 4, -2, 4, -2, 4, -2, 4, -2 },
},
{
- {
- -1, 3, -1, 3, -1, 3, -1, 3,
- },
- {
- -5, 8, -5, 8, -5, 8, -5, 8,
- },
- {
- -15, 38, -15, 38, -15, 38, -15, 38,
- },
- {
- 115, -21, 115, -21, 115, -21, 115, -21,
- },
- {
- 10, -6, 10, -6, 10, -6, 10, -6,
- },
- {
- 3, -1, 3, -1, 3, -1, 3, -1,
- },
+ { -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -5, 8, -5, 8, -5, 8, -5, 8 },
+ { -15, 38, -15, 38, -15, 38, -15, 38 },
+ { 115, -21, 115, -21, 115, -21, 115, -21 },
+ { 10, -6, 10, -6, 10, -6, 10, -6 },
+ { 3, -1, 3, -1, 3, -1, 3, -1 },
},
{
- {
- -1, 2, -1, 2, -1, 2, -1, 2,
- },
- {
- -4, 7, -4, 7, -4, 7, -4, 7,
- },
- {
- -12, 28, -12, 28, -12, 28, -12, 28,
- },
- {
- 120, -18, 120, -18, 120, -18, 120, -18,
- },
- {
- 8, -4, 8, -4, 8, -4, 8, -4,
- },
- {
- 3, -1, 3, -1, 3, -1, 3, -1,
- },
+ { -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -4, 7, -4, 7, -4, 7, -4, 7 },
+ { -12, 28, -12, 28, -12, 28, -12, 28 },
+ { 120, -18, 120, -18, 120, -18, 120, -18 },
+ { 8, -4, 8, -4, 8, -4, 8, -4 },
+ { 3, -1, 3, -1, 3, -1, 3, -1 },
},
{
- {
- -1, 2, -1, 2, -1, 2, -1, 2,
- },
- {
- -2, 4, -2, 4, -2, 4, -2, 4,
- },
- {
- -8, 18, -8, 18, -8, 18, -8, 18,
- },
- {
- 124, -13, 124, -13, 124, -13, 124, -13,
- },
- {
- 6, -3, 6, -3, 6, -3, 6, -3,
- },
- {
- 2, -1, 2, -1, 2, -1, 2, -1,
- },
+ { -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -2, 4, -2, 4, -2, 4, -2, 4 },
+ { -8, 18, -8, 18, -8, 18, -8, 18 },
+ { 124, -13, 124, -13, 124, -13, 124, -13 },
+ { 6, -3, 6, -3, 6, -3, 6, -3 },
+ { 2, -1, 2, -1, 2, -1, 2, -1 },
},
{
- {
- 0, 1, 0, 1, 0, 1, 0, 1,
- },
- {
- -1, 2, -1, 2, -1, 2, -1, 2,
- },
- {
- -4, 8, -4, 8, -4, 8, -4, 8,
- },
- {
- 127, -7, 127, -7, 127, -7, 127, -7,
- },
- {
- 3, -2, 3, -2, 3, -2, 3, -2,
- },
- {
- 1, 0, 1, 0, 1, 0, 1, 0,
- },
+ { 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -4, 8, -4, 8, -4, 8, -4, 8 },
+ { 127, -7, 127, -7, 127, -7, 127, -7 },
+ { 3, -2, 3, -2, 3, -2, 3, -2 },
+ { 1, 0, 1, 0, 1, 0, 1, 0 },
+ },
+};
+#endif
+#endif
+#if CONFIG_AOM_HIGHBITDEPTH
+#if USE_TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(16, static const int16_t,
+ sub_pel_filters_temporalfilter_12_highbd_ver_signal_dir[15][6]
+ [8]) = {
+ {
+ { 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -7, 127, -7, 127, -7, 127, -7, 127 },
+ { 8, -4, 8, -4, 8, -4, 8, -4 },
+ { 2, -1, 2, -1, 2, -1, 2, -1 },
+ { 0, 0, 0, 0, 0, 0, 0, 0 },
+ },
+ {
+ { 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -3, 5, -3, 5, -3, 5, -3, 5 },
+ { -12, 124, -12, 124, -12, 124, -12, 124 },
+ { 18, -8, 18, -8, 18, -8, 18, -8 },
+ { 4, -2, 4, -2, 4, -2, 4, -2 },
+ { 1, 0, 1, 0, 1, 0, 1, 0 },
+ },
+ {
+ { -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -4, 8, -4, 8, -4, 8, -4, 8 },
+ { -17, 120, -17, 120, -17, 120, -17, 120 },
+ { 28, -11, 28, -11, 28, -11, 28, -11 },
+ { 6, -3, 6, -3, 6, -3, 6, -3 },
+ { 1, -1, 1, -1, 1, -1, 1, -1 },
+ },
+ {
+ { -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -4, 10, -4, 10, -4, 10, -4, 10 },
+ { -21, 114, -21, 114, -21, 114, -21, 114 },
+ { 38, -15, 38, -15, 38, -15, 38, -15 },
+ { 8, -4, 8, -4, 8, -4, 8, -4 },
+ { 2, -1, 2, -1, 2, -1, 2, -1 },
+ },
+ {
+ { -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -5, 11, -5, 11, -5, 11, -5, 11 },
+ { -23, 107, -23, 107, -23, 107, -23, 107 },
+ { 49, -18, 49, -18, 49, -18, 49, -18 },
+ { 9, -5, 9, -5, 9, -5, 9, -5 },
+ { 2, -1, 2, -1, 2, -1, 2, -1 },
+ },
+ {
+ { -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -6, 12, -6, 12, -6, 12, -6, 12 },
+ { -25, 99, -25, 99, -25, 99, -25, 99 },
+ { 60, -21, 60, -21, 60, -21, 60, -21 },
+ { 11, -6, 11, -6, 11, -6, 11, -6 },
+ { 3, -1, 3, -1, 3, -1, 3, -1 },
+ },
+ {
+ { -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -6, 12, -6, 12, -6, 12, -6, 12 },
+ { -25, 90, -25, 90, -25, 90, -25, 90 },
+ { 70, -23, 70, -23, 70, -23, 70, -23 },
+ { 12, -6, 12, -6, 12, -6, 12, -6 },
+ { 3, -1, 3, -1, 3, -1, 3, -1 },
+ },
+ {
+ { -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -6, 12, -6, 12, -6, 12, -6, 12 },
+ { -24, 80, -24, 80, -24, 80, -24, 80 },
+ { 80, -24, 80, -24, 80, -24, 80, -24 },
+ { 12, -6, 12, -6, 12, -6, 12, -6 },
+ { 3, -1, 3, -1, 3, -1, 3, -1 },
+ },
+ {
+ { -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -6, 12, -6, 12, -6, 12, -6, 12 },
+ { -23, 70, -23, 70, -23, 70, -23, 70 },
+ { 90, -25, 90, -25, 90, -25, 90, -25 },
+ { 12, -6, 12, -6, 12, -6, 12, -6 },
+ { 3, -1, 3, -1, 3, -1, 3, -1 },
+ },
+ {
+ { -1, 3, -1, 3, -1, 3, -1, 3 },
+ { -6, 11, -6, 11, -6, 11, -6, 11 },
+ { -21, 60, -21, 60, -21, 60, -21, 60 },
+ { 99, -25, 99, -25, 99, -25, 99, -25 },
+ { 12, -6, 12, -6, 12, -6, 12, -6 },
+ { 3, -1, 3, -1, 3, -1, 3, -1 },
+ },
+ {
+ { -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -5, 9, -5, 9, -5, 9, -5, 9 },
+ { -18, 49, -18, 49, -18, 49, -18, 49 },
+ { 107, -23, 107, -23, 107, -23, 107, -23 },
+ { 11, -5, 11, -5, 11, -5, 11, -5 },
+ { 3, -1, 3, -1, 3, -1, 3, -1 },
+ },
+ {
+ { -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -4, 8, -4, 8, -4, 8, -4, 8 },
+ { -15, 38, -15, 38, -15, 38, -15, 38 },
+ { 114, -21, 114, -21, 114, -21, 114, -21 },
+ { 10, -4, 10, -4, 10, -4, 10, -4 },
+ { 2, -1, 2, -1, 2, -1, 2, -1 },
+ },
+ {
+ { -1, 1, -1, 1, -1, 1, -1, 1 },
+ { -3, 6, -3, 6, -3, 6, -3, 6 },
+ { -11, 28, -11, 28, -11, 28, -11, 28 },
+ { 120, -17, 120, -17, 120, -17, 120, -17 },
+ { 8, -4, 8, -4, 8, -4, 8, -4 },
+ { 2, -1, 2, -1, 2, -1, 2, -1 },
+ },
+ {
+ { 0, 1, 0, 1, 0, 1, 0, 1 },
+ { -2, 4, -2, 4, -2, 4, -2, 4 },
+ { -8, 18, -8, 18, -8, 18, -8, 18 },
+ { 124, -12, 124, -12, 124, -12, 124, -12 },
+ { 5, -3, 5, -3, 5, -3, 5, -3 },
+ { 1, 0, 1, 0, 1, 0, 1, 0 },
+ },
+ {
+ { 0, 0, 0, 0, 0, 0, 0, 0 },
+ { -1, 2, -1, 2, -1, 2, -1, 2 },
+ { -4, 8, -4, 8, -4, 8, -4, 8 },
+ { 127, -7, 127, -7, 127, -7, 127, -7 },
+ { 3, -1, 3, -1, 3, -1, 3, -1 },
+ { 1, 0, 1, 0, 1, 0, 1, 0 },
},
};
#endif
diff --git a/av1/common/x86/av1_highbd_convolve_sse4.c b/av1/common/x86/av1_highbd_convolve_sse4.c
index 1877e08..9b5ef25 100644
--- a/av1/common/x86/av1_highbd_convolve_sse4.c
+++ b/av1/common/x86/av1_highbd_convolve_sse4.c
@@ -22,8 +22,8 @@
int src_stride, uint16_t *dst, int dst_stride,
int bd);
-static INLINE HbdSubpelFilterCoeffs av1_hbd_get_subpel_filter_ver_signal_dir(
- const InterpFilterParams p, int index) {
+static INLINE HbdSubpelFilterCoeffs
+hbd_get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
#if CONFIG_EXT_INTERP
if (p.interp_filter == MULTITAP_SHARP2) {
return &sub_pel_filters_12sharp_highbd_ver_signal_dir[index][0];
@@ -32,6 +32,11 @@
return &sub_pel_filters_10sharp_highbd_ver_signal_dir[index][0];
}
#endif
+#if USE_TEMPORALFILTER_12TAP
+ if (p.interp_filter == TEMPORALFILTER_12TAP) {
+ return &sub_pel_filters_temporalfilter_12_highbd_ver_signal_dir[index][0];
+ }
+#endif
(void)p;
(void)index;
return NULL;
@@ -253,7 +258,7 @@
}
vCoeffs =
- av1_hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1);
+ hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1);
if (!vCoeffs) {
av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
filter_params, subpel_x_q4, x_step_q4, avg, bd);
@@ -457,7 +462,7 @@
}
vCoeffs =
- av1_hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
+ hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
if (!vCoeffs) {
av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
filter_params, subpel_y_q4, y_step_q4, avg, bd);
diff --git a/av1/common/x86/av1_inv_txfm_sse2.c b/av1/common/x86/av1_inv_txfm_sse2.c
deleted file mode 100644
index 44fe037..0000000
--- a/av1/common/x86/av1_inv_txfm_sse2.c
+++ /dev/null
@@ -1,4028 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "./av1_rtcd.h"
-#include "av1/common/x86/av1_inv_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-
-#define RECON_AND_STORE4X4(dest, in_x) \
- { \
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
- d0 = _mm_unpacklo_epi8(d0, zero); \
- d0 = _mm_add_epi16(in_x, d0); \
- d0 = _mm_packus_epi16(d0, d0); \
- *(int *)(dest) = _mm_cvtsi128_si32(d0); \
- }
-
-void av1_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i cst = _mm_setr_epi16(
- (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
- (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
- (int16_t)cospi_8_64, (int16_t)cospi_24_64);
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i input0, input1, input2, input3;
-
- // Rows
- input0 = _mm_load_si128((const __m128i *)input);
- input2 = _mm_load_si128((const __m128i *)(input + 8));
-
- // Construct i3, i1, i3, i1, i2, i0, i2, i0
- input0 = _mm_shufflelo_epi16(input0, 0xd8);
- input0 = _mm_shufflehi_epi16(input0, 0xd8);
- input2 = _mm_shufflelo_epi16(input2, 0xd8);
- input2 = _mm_shufflehi_epi16(input2, 0xd8);
-
- input1 = _mm_unpackhi_epi32(input0, input0);
- input0 = _mm_unpacklo_epi32(input0, input0);
- input3 = _mm_unpackhi_epi32(input2, input2);
- input2 = _mm_unpacklo_epi32(input2, input2);
-
- // Stage 1
- input0 = _mm_madd_epi16(input0, cst);
- input1 = _mm_madd_epi16(input1, cst);
- input2 = _mm_madd_epi16(input2, cst);
- input3 = _mm_madd_epi16(input3, cst);
-
- input0 = _mm_add_epi32(input0, rounding);
- input1 = _mm_add_epi32(input1, rounding);
- input2 = _mm_add_epi32(input2, rounding);
- input3 = _mm_add_epi32(input3, rounding);
-
- input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
- input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
- input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
- input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
- // Stage 2
- input0 = _mm_packs_epi32(input0, input1);
- input1 = _mm_packs_epi32(input2, input3);
-
- // Transpose
- input2 = _mm_unpacklo_epi16(input0, input1);
- input3 = _mm_unpackhi_epi16(input0, input1);
- input0 = _mm_unpacklo_epi32(input2, input3);
- input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Swap columns 2 and 3, giving:
-  // input2: column 1, column 0; input3: column 2, column 3.
- input1 = _mm_shuffle_epi32(input1, 0x4e);
- input2 = _mm_add_epi16(input0, input1);
- input3 = _mm_sub_epi16(input0, input1);
-
- // Columns
- // Construct i3, i1, i3, i1, i2, i0, i2, i0
- input0 = _mm_unpacklo_epi32(input2, input2);
- input1 = _mm_unpackhi_epi32(input2, input2);
- input2 = _mm_unpackhi_epi32(input3, input3);
- input3 = _mm_unpacklo_epi32(input3, input3);
-
- // Stage 1
- input0 = _mm_madd_epi16(input0, cst);
- input1 = _mm_madd_epi16(input1, cst);
- input2 = _mm_madd_epi16(input2, cst);
- input3 = _mm_madd_epi16(input3, cst);
-
- input0 = _mm_add_epi32(input0, rounding);
- input1 = _mm_add_epi32(input1, rounding);
- input2 = _mm_add_epi32(input2, rounding);
- input3 = _mm_add_epi32(input3, rounding);
-
- input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
- input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
- input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
- input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
-
- // Stage 2
- input0 = _mm_packs_epi32(input0, input2);
- input1 = _mm_packs_epi32(input1, input3);
-
- // Transpose
- input2 = _mm_unpacklo_epi16(input0, input1);
- input3 = _mm_unpackhi_epi16(input0, input1);
- input0 = _mm_unpacklo_epi32(input2, input3);
- input1 = _mm_unpackhi_epi32(input2, input3);
-
-  // Swap columns 2 and 3, giving:
-  // input2: column 1, column 0; input3: column 2, column 3.
- input1 = _mm_shuffle_epi32(input1, 0x4e);
- input2 = _mm_add_epi16(input0, input1);
- input3 = _mm_sub_epi16(input0, input1);
-
- // Final round and shift
- input2 = _mm_add_epi16(input2, eight);
- input3 = _mm_add_epi16(input3, eight);
-
- input2 = _mm_srai_epi16(input2, 4);
- input3 = _mm_srai_epi16(input3, 4);
-
- // Reconstruction and Store
- {
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
- __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi32(d0,
- _mm_cvtsi32_si128(*(const int *)(dest + stride)));
- d2 = _mm_unpacklo_epi32(
- _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
- d0 = _mm_unpacklo_epi8(d0, zero);
- d2 = _mm_unpacklo_epi8(d2, zero);
- d0 = _mm_add_epi16(d0, input2);
- d2 = _mm_add_epi16(d2, input3);
- d0 = _mm_packus_epi16(d0, d2);
- // store input0
- *(int *)dest = _mm_cvtsi128_si32(d0);
- // store input1
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
- // store input2
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
- // store input3
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
- }
-}
-
-void av1_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- __m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a;
-
- a = dct_const_round_shift(input[0] * cospi_16_64);
- a = dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 4);
-
- dc_value = _mm_set1_epi16(a);
-
- RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
- RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
- RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
- RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
-}
-
-static INLINE void transpose_4x4(__m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
- const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
- res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
- res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
-}
-
-void av1_idct4_sse2(__m128i *in) {
- const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u[8], v[8];
-
- transpose_4x4(in);
- // stage 1
- u[0] = _mm_unpacklo_epi16(in[0], in[1]);
- u[1] = _mm_unpackhi_epi16(in[0], in[1]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
- u[0] = _mm_packs_epi32(v[0], v[1]);
- u[1] = _mm_packs_epi32(v[3], v[2]);
-
- // stage 2
- in[0] = _mm_add_epi16(u[0], u[1]);
- in[1] = _mm_sub_epi16(u[0], u[1]);
- in[1] = _mm_shuffle_epi32(in[1], 0x4E);
-}
-
-void av1_iadst4_sse2(__m128i *in) {
- const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
- const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
- const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
- const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
- const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
- const __m128i kZero = _mm_set1_epi16(0);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u[8], v[8], in7;
-
- transpose_4x4(in);
- in7 = _mm_srli_si128(in[1], 8);
- in7 = _mm_add_epi16(in7, in[0]);
- in7 = _mm_sub_epi16(in7, in[1]);
-
- u[0] = _mm_unpacklo_epi16(in[0], in[1]);
- u[1] = _mm_unpackhi_epi16(in[0], in[1]);
- u[2] = _mm_unpacklo_epi16(in7, kZero);
- u[3] = _mm_unpackhi_epi16(in[0], kZero);
-
- v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3
- v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5
- v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2
- v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4
- v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6
- v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2
-
- u[0] = _mm_add_epi32(v[0], v[1]);
- u[1] = _mm_add_epi32(v[3], v[4]);
- u[2] = v[2];
- u[3] = _mm_add_epi32(u[0], u[1]);
- u[4] = _mm_slli_epi32(v[5], 2);
- u[5] = _mm_add_epi32(u[3], v[5]);
- u[6] = _mm_sub_epi32(u[5], u[4]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u[0], u[1]);
- in[1] = _mm_packs_epi32(u[2], u[3]);
-}
-
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3, out4, out5, out6, out7) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
- const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
- const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
- const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
- const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
- const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
- \
- out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
- out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
- out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
- out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
- out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
- out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
- out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
- out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
- }
-
-#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
- { \
- const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
- const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
- const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
- \
- out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
- out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
- out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
- out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
- }
-
-#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- }
-
-// Define a macro for multiplying elements by constants and adding them together.
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
- res0, res1, res2, res3) \
- { \
- tmp0 = _mm_madd_epi16(lo_0, cst0); \
- tmp1 = _mm_madd_epi16(hi_0, cst0); \
- tmp2 = _mm_madd_epi16(lo_0, cst1); \
- tmp3 = _mm_madd_epi16(hi_0, cst1); \
- tmp4 = _mm_madd_epi16(lo_1, cst2); \
- tmp5 = _mm_madd_epi16(hi_1, cst2); \
- tmp6 = _mm_madd_epi16(lo_1, cst3); \
- tmp7 = _mm_madd_epi16(hi_1, cst3); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp4 = _mm_add_epi32(tmp4, rounding); \
- tmp5 = _mm_add_epi32(tmp5, rounding); \
- tmp6 = _mm_add_epi32(tmp6, rounding); \
- tmp7 = _mm_add_epi32(tmp7, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
- \
- res0 = _mm_packs_epi32(tmp0, tmp1); \
- res1 = _mm_packs_epi32(tmp2, tmp3); \
- res2 = _mm_packs_epi32(tmp4, tmp5); \
- res3 = _mm_packs_epi32(tmp6, tmp7); \
- }
-
-#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
- { \
- tmp0 = _mm_madd_epi16(lo_0, cst0); \
- tmp1 = _mm_madd_epi16(hi_0, cst0); \
- tmp2 = _mm_madd_epi16(lo_0, cst1); \
- tmp3 = _mm_madd_epi16(hi_0, cst1); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- res0 = _mm_packs_epi32(tmp0, tmp1); \
- res1 = _mm_packs_epi32(tmp2, tmp3); \
- }
-
-#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
- out4, out5, out6, out7) \
- { \
- /* Stage1 */ \
- { \
- const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
- const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
- const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
- const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
- \
- MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \
- stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \
- } \
- \
- /* Stage2 */ \
- { \
- const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
- const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
- const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
- const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
- \
- MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \
- stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \
- \
- stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
- tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
- tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
- tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- } \
- \
- /* Stage4 */ \
- out0 = _mm_adds_epi16(stp1_0, stp2_7); \
- out1 = _mm_adds_epi16(stp1_1, stp1_6); \
- out2 = _mm_adds_epi16(stp1_2, stp1_5); \
- out3 = _mm_adds_epi16(stp1_3, stp2_4); \
- out4 = _mm_subs_epi16(stp1_3, stp2_4); \
- out5 = _mm_subs_epi16(stp1_2, stp1_5); \
- out6 = _mm_subs_epi16(stp1_1, stp1_6); \
- out7 = _mm_subs_epi16(stp1_0, stp2_7); \
- }
-
-void av1_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 4);
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
-
- // Load input data.
- in0 = _mm_load_si128((const __m128i *)input);
- in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
- in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
- in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
- in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
- in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));
-
- // 2-D
- for (i = 0; i < 2; i++) {
- // 8x8 Transpose is copied from av1_fdct8x8_sse2()
- TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
-
- // 4-stage 1D av1_idct8x8
- IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5,
- in6, in7);
- }
-
- // Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 5);
- in1 = _mm_srai_epi16(in1, 5);
- in2 = _mm_srai_epi16(in2, 5);
- in3 = _mm_srai_epi16(in3, 5);
- in4 = _mm_srai_epi16(in4, 5);
- in5 = _mm_srai_epi16(in5, 5);
- in6 = _mm_srai_epi16(in6, 5);
- in7 = _mm_srai_epi16(in7, 5);
-
- RECON_AND_STORE(dest + 0 * stride, in0);
- RECON_AND_STORE(dest + 1 * stride, in1);
- RECON_AND_STORE(dest + 2 * stride, in2);
- RECON_AND_STORE(dest + 3 * stride, in3);
- RECON_AND_STORE(dest + 4 * stride, in4);
- RECON_AND_STORE(dest + 5 * stride, in5);
- RECON_AND_STORE(dest + 6 * stride, in6);
- RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-void av1_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- __m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a;
-
- a = dct_const_round_shift(input[0] * cospi_16_64);
- a = dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 5);
-
- dc_value = _mm_set1_epi16(a);
-
- RECON_AND_STORE(dest + 0 * stride, dc_value);
- RECON_AND_STORE(dest + 1 * stride, dc_value);
- RECON_AND_STORE(dest + 2 * stride, dc_value);
- RECON_AND_STORE(dest + 3 * stride, dc_value);
- RECON_AND_STORE(dest + 4 * stride, dc_value);
- RECON_AND_STORE(dest + 5 * stride, dc_value);
- RECON_AND_STORE(dest + 6 * stride, dc_value);
- RECON_AND_STORE(dest + 7 * stride, dc_value);
-}
-
-void av1_idct8_sse2(__m128i *in) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
-
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
- // 8x8 Transpose is copied from av1_fdct8x8_sse2()
- TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
- in1, in2, in3, in4, in5, in6, in7);
-
- // 4-stage 1D av1_idct8x8
- IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3],
- in[4], in[5], in[6], in[7]);
-}
-
-void av1_iadst8_sse2(__m128i *in) {
- const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
- const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__const_0 = _mm_set1_epi16(0);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
- __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
- __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
- __m128i s0, s1, s2, s3, s4, s5, s6, s7;
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-
- // transpose
- array_transpose_8x8(in, in);
-
- // properly aligned for butterfly input
- in0 = in[7];
- in1 = in[0];
- in2 = in[5];
- in3 = in[2];
- in4 = in[3];
- in5 = in[4];
- in6 = in[1];
- in7 = in[6];
-
- // column transformation
- // stage 1
-  // interleave and multiply/add into 32-bit integers
- s0 = _mm_unpacklo_epi16(in0, in1);
- s1 = _mm_unpackhi_epi16(in0, in1);
- s2 = _mm_unpacklo_epi16(in2, in3);
- s3 = _mm_unpackhi_epi16(in2, in3);
- s4 = _mm_unpacklo_epi16(in4, in5);
- s5 = _mm_unpackhi_epi16(in4, in5);
- s6 = _mm_unpacklo_epi16(in6, in7);
- s7 = _mm_unpackhi_epi16(in6, in7);
-
- u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
- u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
- u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
- u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
- u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
- u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
- u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
- u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
- u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
- u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
- u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
- u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
- u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
- u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
- u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
- u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
-
- // addition
- w0 = _mm_add_epi32(u0, u8);
- w1 = _mm_add_epi32(u1, u9);
- w2 = _mm_add_epi32(u2, u10);
- w3 = _mm_add_epi32(u3, u11);
- w4 = _mm_add_epi32(u4, u12);
- w5 = _mm_add_epi32(u5, u13);
- w6 = _mm_add_epi32(u6, u14);
- w7 = _mm_add_epi32(u7, u15);
- w8 = _mm_sub_epi32(u0, u8);
- w9 = _mm_sub_epi32(u1, u9);
- w10 = _mm_sub_epi32(u2, u10);
- w11 = _mm_sub_epi32(u3, u11);
- w12 = _mm_sub_epi32(u4, u12);
- w13 = _mm_sub_epi32(u5, u13);
- w14 = _mm_sub_epi32(u6, u14);
- w15 = _mm_sub_epi32(u7, u15);
-
- // shift and rounding
- v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
- v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
- v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
- v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
- v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
- v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
- v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
- v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
- v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
- u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
- u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
- u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
- u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
- u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
- u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
- u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
- // back to 16-bit and pack 8 integers into __m128i
- in[0] = _mm_packs_epi32(u0, u1);
- in[1] = _mm_packs_epi32(u2, u3);
- in[2] = _mm_packs_epi32(u4, u5);
- in[3] = _mm_packs_epi32(u6, u7);
- in[4] = _mm_packs_epi32(u8, u9);
- in[5] = _mm_packs_epi32(u10, u11);
- in[6] = _mm_packs_epi32(u12, u13);
- in[7] = _mm_packs_epi32(u14, u15);
-
- // stage 2
- s0 = _mm_add_epi16(in[0], in[2]);
- s1 = _mm_add_epi16(in[1], in[3]);
- s2 = _mm_sub_epi16(in[0], in[2]);
- s3 = _mm_sub_epi16(in[1], in[3]);
- u0 = _mm_unpacklo_epi16(in[4], in[5]);
- u1 = _mm_unpackhi_epi16(in[4], in[5]);
- u2 = _mm_unpacklo_epi16(in[6], in[7]);
- u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
- v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
- v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
- v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
- v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
- v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
- v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
- v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
- w0 = _mm_add_epi32(v0, v4);
- w1 = _mm_add_epi32(v1, v5);
- w2 = _mm_add_epi32(v2, v6);
- w3 = _mm_add_epi32(v3, v7);
- w4 = _mm_sub_epi32(v0, v4);
- w5 = _mm_sub_epi32(v1, v5);
- w6 = _mm_sub_epi32(v2, v6);
- w7 = _mm_sub_epi32(v3, v7);
-
- v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
- // back to 16-bit integers
- s4 = _mm_packs_epi32(u0, u1);
- s5 = _mm_packs_epi32(u2, u3);
- s6 = _mm_packs_epi32(u4, u5);
- s7 = _mm_packs_epi32(u6, u7);
-
- // stage 3
- u0 = _mm_unpacklo_epi16(s2, s3);
- u1 = _mm_unpackhi_epi16(s2, s3);
- u2 = _mm_unpacklo_epi16(s6, s7);
- u3 = _mm_unpackhi_epi16(s6, s7);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
- v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
- v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
- v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
- v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
- v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
- v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
- v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
- u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
- u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
- u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
- u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
- v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
- v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
- v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
- v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
- s2 = _mm_packs_epi32(v0, v1);
- s3 = _mm_packs_epi32(v2, v3);
- s6 = _mm_packs_epi32(v4, v5);
- s7 = _mm_packs_epi32(v6, v7);
-
- in[0] = s0;
- in[1] = _mm_sub_epi16(k__const_0, s4);
- in[2] = s6;
- in[3] = _mm_sub_epi16(k__const_0, s2);
- in[4] = s3;
- in[5] = _mm_sub_epi16(k__const_0, s7);
- in[6] = s5;
- in[7] = _mm_sub_epi16(k__const_0, s1);
-}
-
-void av1_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 4);
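- // 1 << 4 is half of the final >> 5 step, giving round-to-nearest on the
- // 2-D output before it is added back to the prediction.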
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
- // Rows. Load 4-row input data.
- in0 = _mm_load_si128((const __m128i *)input);
- in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
- in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
-
- // 8x4 Transpose
- TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
- // Stage1
- {
- const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
- const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
-
- tmp0 = _mm_madd_epi16(lo_17, stg1_0);
- tmp2 = _mm_madd_epi16(lo_17, stg1_1);
- tmp4 = _mm_madd_epi16(lo_35, stg1_2);
- tmp6 = _mm_madd_epi16(lo_35, stg1_3);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp1_4 = _mm_packs_epi32(tmp0, tmp2);
- stp1_5 = _mm_packs_epi32(tmp4, tmp6);
- }
-
- // Stage2
- {
- const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
- const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
-
- tmp0 = _mm_madd_epi16(lo_04, stg2_0);
- tmp2 = _mm_madd_epi16(lo_04, stg2_1);
- tmp4 = _mm_madd_epi16(lo_26, stg2_2);
- tmp6 = _mm_madd_epi16(lo_26, stg2_3);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp2_0 = _mm_packs_epi32(tmp0, tmp2);
- stp2_2 = _mm_packs_epi32(tmp6, tmp4);
-
- tmp0 = _mm_adds_epi16(stp1_4, stp1_5);
- tmp1 = _mm_subs_epi16(stp1_4, stp1_5);
-
- stp2_4 = tmp0;
- stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
- stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
- }
-
- // Stage3
- {
- const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-
- tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
- tmp6 = _mm_subs_epi16(stp2_0, stp2_2);
-
- stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
- stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
-
- tmp0 = _mm_madd_epi16(lo_56, stg3_0);
- tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
- stp1_5 = _mm_packs_epi32(tmp0, tmp2);
- }
-
- // Stage4
- tmp0 = _mm_adds_epi16(stp1_3, stp2_4);
- tmp1 = _mm_adds_epi16(stp1_2, stp1_5);
- tmp2 = _mm_subs_epi16(stp1_3, stp2_4);
- tmp3 = _mm_subs_epi16(stp1_2, stp1_5);
-
- TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
-
- IDCT8(in0, in1, in2, in3, zero, zero, zero, zero, in0, in1, in2, in3, in4,
- in5, in6, in7);
- // Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 5);
- in1 = _mm_srai_epi16(in1, 5);
- in2 = _mm_srai_epi16(in2, 5);
- in3 = _mm_srai_epi16(in3, 5);
- in4 = _mm_srai_epi16(in4, 5);
- in5 = _mm_srai_epi16(in5, 5);
- in6 = _mm_srai_epi16(in6, 5);
- in7 = _mm_srai_epi16(in7, 5);
-
- RECON_AND_STORE(dest + 0 * stride, in0);
- RECON_AND_STORE(dest + 1 * stride, in1);
- RECON_AND_STORE(dest + 2 * stride, in2);
- RECON_AND_STORE(dest + 3 * stride, in3);
- RECON_AND_STORE(dest + 4 * stride, in4);
- RECON_AND_STORE(dest + 5 * stride, in5);
- RECON_AND_STORE(dest + 6 * stride, in6);
- RECON_AND_STORE(dest + 7 * stride, in7);
-}
-
-#define IDCT16 \
- /* Stage2 */ \
- { \
- const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
- const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
- const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
- const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
- const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
- const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
- const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
- const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \
- stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \
- \
- MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \
- stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
- const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
- const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
- const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
- \
- MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \
- stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \
- \
- stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
- \
- stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
- const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
- const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
- const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \
- stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
- \
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
- stp2_13) \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
- \
- stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- }
-
-#define IDCT16_10 \
- /* Stage2 */ \
- { \
- const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \
- const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \
- const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \
- const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \
- stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \
- stp1_12_0) \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \
- const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \
- \
- MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \
- \
- stp1_9 = stp1_8_0; \
- stp1_10 = stp1_11; \
- \
- stp1_13 = stp1_12_0; \
- stp1_14 = stp1_15; \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \
- const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \
- stp2_5 = stp2_4; \
- stp2_6 = stp2_7; \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \
- stp2_13) \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- \
- stp1_2 = stp1_1; \
- stp1_3 = stp1_0; \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
- \
- stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- }
-
-void av1_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
- int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in[16], l[16], r[16], *curr1;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_8_0, stp1_12_0;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
-
- curr1 = l;
- for (i = 0; i < 2; i++) {
- // 1-D av1_idct
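- // Two passes: this loop runs the 1-D row transform, storing the top eight
- // rows in l[] and the bottom eight rows in r[]; the loop below runs the
- // column transform on each 8-column half and reconstructs the pixels.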
-
- // Load input data.
- in[0] = _mm_load_si128((const __m128i *)input);
- in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
- in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
- in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
- in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
- in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
- in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
- in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
- in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
- in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
- in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
- in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
- in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
- in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
-
- array_transpose_8x8(in, in);
- array_transpose_8x8(in + 8, in + 8);
-
- IDCT16
-
- // Stage7
- curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
- curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
- curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
- curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
- curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
- curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
- curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
- curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
- curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
- curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
- curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
- curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
- curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
- curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
- curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
- curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- curr1 = r;
- input += 128;
- }
- for (i = 0; i < 2; i++) {
- int j;
- // 1-D av1_idct
- array_transpose_8x8(l + i * 8, in);
- array_transpose_8x8(r + i * 8, in + 8);
-
- IDCT16
-
- // 2-D
- in[0] = _mm_add_epi16(stp2_0, stp1_15);
- in[1] = _mm_add_epi16(stp2_1, stp1_14);
- in[2] = _mm_add_epi16(stp2_2, stp2_13);
- in[3] = _mm_add_epi16(stp2_3, stp2_12);
- in[4] = _mm_add_epi16(stp2_4, stp2_11);
- in[5] = _mm_add_epi16(stp2_5, stp2_10);
- in[6] = _mm_add_epi16(stp2_6, stp1_9);
- in[7] = _mm_add_epi16(stp2_7, stp1_8);
- in[8] = _mm_sub_epi16(stp2_7, stp1_8);
- in[9] = _mm_sub_epi16(stp2_6, stp1_9);
- in[10] = _mm_sub_epi16(stp2_5, stp2_10);
- in[11] = _mm_sub_epi16(stp2_4, stp2_11);
- in[12] = _mm_sub_epi16(stp2_3, stp2_12);
- in[13] = _mm_sub_epi16(stp2_2, stp2_13);
- in[14] = _mm_sub_epi16(stp2_1, stp1_14);
- in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- for (j = 0; j < 16; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
-
- dest += 8;
- }
-}
-
-void av1_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- __m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a, i;
-
- a = dct_const_round_shift(input[0] * cospi_16_64);
- a = dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 6);
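- // DC-only block: the 2-D inverse transform of a lone DC coefficient is a
- // constant plane, so broadcast the rounded value and add it to every
- // prediction pixel below.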
-
- dc_value = _mm_set1_epi16(a);
-
- for (i = 0; i < 2; ++i) {
- RECON_AND_STORE(dest + 0 * stride, dc_value);
- RECON_AND_STORE(dest + 1 * stride, dc_value);
- RECON_AND_STORE(dest + 2 * stride, dc_value);
- RECON_AND_STORE(dest + 3 * stride, dc_value);
- RECON_AND_STORE(dest + 4 * stride, dc_value);
- RECON_AND_STORE(dest + 5 * stride, dc_value);
- RECON_AND_STORE(dest + 6 * stride, dc_value);
- RECON_AND_STORE(dest + 7 * stride, dc_value);
- RECON_AND_STORE(dest + 8 * stride, dc_value);
- RECON_AND_STORE(dest + 9 * stride, dc_value);
- RECON_AND_STORE(dest + 10 * stride, dc_value);
- RECON_AND_STORE(dest + 11 * stride, dc_value);
- RECON_AND_STORE(dest + 12 * stride, dc_value);
- RECON_AND_STORE(dest + 13 * stride, dc_value);
- RECON_AND_STORE(dest + 14 * stride, dc_value);
- RECON_AND_STORE(dest + 15 * stride, dc_value);
- dest += 8;
- }
-}
-
-static void av1_iadst16_8col(__m128i *in) {
- // perform 16x16 1-D ADST for 8 columns
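- // Each _mm_madd_epi16 below multiplies interleaved 16-bit pairs (x, y)
- // by a pair_set_epi16(c0, c1) constant and sums adjacent products, i.e.
- // it computes the butterfly term x * c0 + y * c1 in 32-bit precision.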
- __m128i s[16], x[16], u[32], v[32];
- const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
- const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
- const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
- const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
- const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
- const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
- const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
- const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
- const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
- const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kZero = _mm_set1_epi16(0);
-
- u[0] = _mm_unpacklo_epi16(in[15], in[0]);
- u[1] = _mm_unpackhi_epi16(in[15], in[0]);
- u[2] = _mm_unpacklo_epi16(in[13], in[2]);
- u[3] = _mm_unpackhi_epi16(in[13], in[2]);
- u[4] = _mm_unpacklo_epi16(in[11], in[4]);
- u[5] = _mm_unpackhi_epi16(in[11], in[4]);
- u[6] = _mm_unpacklo_epi16(in[9], in[6]);
- u[7] = _mm_unpackhi_epi16(in[9], in[6]);
- u[8] = _mm_unpacklo_epi16(in[7], in[8]);
- u[9] = _mm_unpackhi_epi16(in[7], in[8]);
- u[10] = _mm_unpacklo_epi16(in[5], in[10]);
- u[11] = _mm_unpackhi_epi16(in[5], in[10]);
- u[12] = _mm_unpacklo_epi16(in[3], in[12]);
- u[13] = _mm_unpackhi_epi16(in[3], in[12]);
- u[14] = _mm_unpacklo_epi16(in[1], in[14]);
- u[15] = _mm_unpackhi_epi16(in[1], in[14]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
- v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
- v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
- v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
- v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
- v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
- v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
- v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
- v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
- v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
- v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
- v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
- v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
- v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
- v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
- v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
- v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
- v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
- v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
-
- u[0] = _mm_add_epi32(v[0], v[16]);
- u[1] = _mm_add_epi32(v[1], v[17]);
- u[2] = _mm_add_epi32(v[2], v[18]);
- u[3] = _mm_add_epi32(v[3], v[19]);
- u[4] = _mm_add_epi32(v[4], v[20]);
- u[5] = _mm_add_epi32(v[5], v[21]);
- u[6] = _mm_add_epi32(v[6], v[22]);
- u[7] = _mm_add_epi32(v[7], v[23]);
- u[8] = _mm_add_epi32(v[8], v[24]);
- u[9] = _mm_add_epi32(v[9], v[25]);
- u[10] = _mm_add_epi32(v[10], v[26]);
- u[11] = _mm_add_epi32(v[11], v[27]);
- u[12] = _mm_add_epi32(v[12], v[28]);
- u[13] = _mm_add_epi32(v[13], v[29]);
- u[14] = _mm_add_epi32(v[14], v[30]);
- u[15] = _mm_add_epi32(v[15], v[31]);
- u[16] = _mm_sub_epi32(v[0], v[16]);
- u[17] = _mm_sub_epi32(v[1], v[17]);
- u[18] = _mm_sub_epi32(v[2], v[18]);
- u[19] = _mm_sub_epi32(v[3], v[19]);
- u[20] = _mm_sub_epi32(v[4], v[20]);
- u[21] = _mm_sub_epi32(v[5], v[21]);
- u[22] = _mm_sub_epi32(v[6], v[22]);
- u[23] = _mm_sub_epi32(v[7], v[23]);
- u[24] = _mm_sub_epi32(v[8], v[24]);
- u[25] = _mm_sub_epi32(v[9], v[25]);
- u[26] = _mm_sub_epi32(v[10], v[26]);
- u[27] = _mm_sub_epi32(v[11], v[27]);
- u[28] = _mm_sub_epi32(v[12], v[28]);
- u[29] = _mm_sub_epi32(v[13], v[29]);
- u[30] = _mm_sub_epi32(v[14], v[30]);
- u[31] = _mm_sub_epi32(v[15], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
- v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
- v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
- v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
- v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
- v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
- v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
- v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
- v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
- v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
- v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
- v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
- v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
- v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
- v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
- v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
- v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
- u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
- u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
- u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
- u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
- u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
- u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
- u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
- u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
- u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
- u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
- u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
- u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
- u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
- u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
- u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
- u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
-
- s[0] = _mm_packs_epi32(u[0], u[1]);
- s[1] = _mm_packs_epi32(u[2], u[3]);
- s[2] = _mm_packs_epi32(u[4], u[5]);
- s[3] = _mm_packs_epi32(u[6], u[7]);
- s[4] = _mm_packs_epi32(u[8], u[9]);
- s[5] = _mm_packs_epi32(u[10], u[11]);
- s[6] = _mm_packs_epi32(u[12], u[13]);
- s[7] = _mm_packs_epi32(u[14], u[15]);
- s[8] = _mm_packs_epi32(u[16], u[17]);
- s[9] = _mm_packs_epi32(u[18], u[19]);
- s[10] = _mm_packs_epi32(u[20], u[21]);
- s[11] = _mm_packs_epi32(u[22], u[23]);
- s[12] = _mm_packs_epi32(u[24], u[25]);
- s[13] = _mm_packs_epi32(u[26], u[27]);
- s[14] = _mm_packs_epi32(u[28], u[29]);
- s[15] = _mm_packs_epi32(u[30], u[31]);
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[8], s[9]);
- u[1] = _mm_unpackhi_epi16(s[8], s[9]);
- u[2] = _mm_unpacklo_epi16(s[10], s[11]);
- u[3] = _mm_unpackhi_epi16(s[10], s[11]);
- u[4] = _mm_unpacklo_epi16(s[12], s[13]);
- u[5] = _mm_unpackhi_epi16(s[12], s[13]);
- u[6] = _mm_unpacklo_epi16(s[14], s[15]);
- u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
- v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
- v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
-
- u[0] = _mm_add_epi32(v[0], v[8]);
- u[1] = _mm_add_epi32(v[1], v[9]);
- u[2] = _mm_add_epi32(v[2], v[10]);
- u[3] = _mm_add_epi32(v[3], v[11]);
- u[4] = _mm_add_epi32(v[4], v[12]);
- u[5] = _mm_add_epi32(v[5], v[13]);
- u[6] = _mm_add_epi32(v[6], v[14]);
- u[7] = _mm_add_epi32(v[7], v[15]);
- u[8] = _mm_sub_epi32(v[0], v[8]);
- u[9] = _mm_sub_epi32(v[1], v[9]);
- u[10] = _mm_sub_epi32(v[2], v[10]);
- u[11] = _mm_sub_epi32(v[3], v[11]);
- u[12] = _mm_sub_epi32(v[4], v[12]);
- u[13] = _mm_sub_epi32(v[5], v[13]);
- u[14] = _mm_sub_epi32(v[6], v[14]);
- u[15] = _mm_sub_epi32(v[7], v[15]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- x[0] = _mm_add_epi16(s[0], s[4]);
- x[1] = _mm_add_epi16(s[1], s[5]);
- x[2] = _mm_add_epi16(s[2], s[6]);
- x[3] = _mm_add_epi16(s[3], s[7]);
- x[4] = _mm_sub_epi16(s[0], s[4]);
- x[5] = _mm_sub_epi16(s[1], s[5]);
- x[6] = _mm_sub_epi16(s[2], s[6]);
- x[7] = _mm_sub_epi16(s[3], s[7]);
- x[8] = _mm_packs_epi32(u[0], u[1]);
- x[9] = _mm_packs_epi32(u[2], u[3]);
- x[10] = _mm_packs_epi32(u[4], u[5]);
- x[11] = _mm_packs_epi32(u[6], u[7]);
- x[12] = _mm_packs_epi32(u[8], u[9]);
- x[13] = _mm_packs_epi32(u[10], u[11]);
- x[14] = _mm_packs_epi32(u[12], u[13]);
- x[15] = _mm_packs_epi32(u[14], u[15]);
-
- // stage 3
- u[0] = _mm_unpacklo_epi16(x[4], x[5]);
- u[1] = _mm_unpackhi_epi16(x[4], x[5]);
- u[2] = _mm_unpacklo_epi16(x[6], x[7]);
- u[3] = _mm_unpackhi_epi16(x[6], x[7]);
- u[4] = _mm_unpacklo_epi16(x[12], x[13]);
- u[5] = _mm_unpackhi_epi16(x[12], x[13]);
- u[6] = _mm_unpacklo_epi16(x[14], x[15]);
- u[7] = _mm_unpackhi_epi16(x[14], x[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
-
- u[0] = _mm_add_epi32(v[0], v[4]);
- u[1] = _mm_add_epi32(v[1], v[5]);
- u[2] = _mm_add_epi32(v[2], v[6]);
- u[3] = _mm_add_epi32(v[3], v[7]);
- u[4] = _mm_sub_epi32(v[0], v[4]);
- u[5] = _mm_sub_epi32(v[1], v[5]);
- u[6] = _mm_sub_epi32(v[2], v[6]);
- u[7] = _mm_sub_epi32(v[3], v[7]);
- u[8] = _mm_add_epi32(v[8], v[12]);
- u[9] = _mm_add_epi32(v[9], v[13]);
- u[10] = _mm_add_epi32(v[10], v[14]);
- u[11] = _mm_add_epi32(v[11], v[15]);
- u[12] = _mm_sub_epi32(v[8], v[12]);
- u[13] = _mm_sub_epi32(v[9], v[13]);
- u[14] = _mm_sub_epi32(v[10], v[14]);
- u[15] = _mm_sub_epi32(v[11], v[15]);
-
- u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[0] = _mm_add_epi16(x[0], x[2]);
- s[1] = _mm_add_epi16(x[1], x[3]);
- s[2] = _mm_sub_epi16(x[0], x[2]);
- s[3] = _mm_sub_epi16(x[1], x[3]);
- s[4] = _mm_packs_epi32(v[0], v[1]);
- s[5] = _mm_packs_epi32(v[2], v[3]);
- s[6] = _mm_packs_epi32(v[4], v[5]);
- s[7] = _mm_packs_epi32(v[6], v[7]);
- s[8] = _mm_add_epi16(x[8], x[10]);
- s[9] = _mm_add_epi16(x[9], x[11]);
- s[10] = _mm_sub_epi16(x[8], x[10]);
- s[11] = _mm_sub_epi16(x[9], x[11]);
- s[12] = _mm_packs_epi32(v[8], v[9]);
- s[13] = _mm_packs_epi32(v[10], v[11]);
- s[14] = _mm_packs_epi32(v[12], v[13]);
- s[15] = _mm_packs_epi32(v[14], v[15]);
-
- // stage 4
- u[0] = _mm_unpacklo_epi16(s[2], s[3]);
- u[1] = _mm_unpackhi_epi16(s[2], s[3]);
- u[2] = _mm_unpacklo_epi16(s[6], s[7]);
- u[3] = _mm_unpackhi_epi16(s[6], s[7]);
- u[4] = _mm_unpacklo_epi16(s[10], s[11]);
- u[5] = _mm_unpackhi_epi16(s[10], s[11]);
- u[6] = _mm_unpacklo_epi16(s[14], s[15]);
- u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
- v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
- v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- in[0] = s[0];
- in[1] = _mm_sub_epi16(kZero, s[8]);
- in[2] = s[12];
- in[3] = _mm_sub_epi16(kZero, s[4]);
- in[4] = _mm_packs_epi32(v[4], v[5]);
- in[5] = _mm_packs_epi32(v[12], v[13]);
- in[6] = _mm_packs_epi32(v[8], v[9]);
- in[7] = _mm_packs_epi32(v[0], v[1]);
- in[8] = _mm_packs_epi32(v[2], v[3]);
- in[9] = _mm_packs_epi32(v[10], v[11]);
- in[10] = _mm_packs_epi32(v[14], v[15]);
- in[11] = _mm_packs_epi32(v[6], v[7]);
- in[12] = s[5];
- in[13] = _mm_sub_epi16(kZero, s[13]);
- in[14] = s[9];
- in[15] = _mm_sub_epi16(kZero, s[1]);
-}
-
-static void av1_idct16_8col(__m128i *in) {
- const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
- const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i v[16], u[16], s[16], t[16];
-
- // stage 1
- s[0] = in[0];
- s[1] = in[8];
- s[2] = in[4];
- s[3] = in[12];
- s[4] = in[2];
- s[5] = in[10];
- s[6] = in[6];
- s[7] = in[14];
- s[8] = in[1];
- s[9] = in[9];
- s[10] = in[5];
- s[11] = in[13];
- s[12] = in[3];
- s[13] = in[11];
- s[14] = in[7];
- s[15] = in[15];
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[8], s[15]);
- u[1] = _mm_unpackhi_epi16(s[8], s[15]);
- u[2] = _mm_unpacklo_epi16(s[9], s[14]);
- u[3] = _mm_unpackhi_epi16(s[9], s[14]);
- u[4] = _mm_unpacklo_epi16(s[10], s[13]);
- u[5] = _mm_unpackhi_epi16(s[10], s[13]);
- u[6] = _mm_unpacklo_epi16(s[11], s[12]);
- u[7] = _mm_unpackhi_epi16(s[11], s[12]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
- v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
- v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[8] = _mm_packs_epi32(u[0], u[1]);
- s[15] = _mm_packs_epi32(u[2], u[3]);
- s[9] = _mm_packs_epi32(u[4], u[5]);
- s[14] = _mm_packs_epi32(u[6], u[7]);
- s[10] = _mm_packs_epi32(u[8], u[9]);
- s[13] = _mm_packs_epi32(u[10], u[11]);
- s[11] = _mm_packs_epi32(u[12], u[13]);
- s[12] = _mm_packs_epi32(u[14], u[15]);
-
- // stage 3
- t[0] = s[0];
- t[1] = s[1];
- t[2] = s[2];
- t[3] = s[3];
- u[0] = _mm_unpacklo_epi16(s[4], s[7]);
- u[1] = _mm_unpackhi_epi16(s[4], s[7]);
- u[2] = _mm_unpacklo_epi16(s[5], s[6]);
- u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- t[4] = _mm_packs_epi32(u[0], u[1]);
- t[7] = _mm_packs_epi32(u[2], u[3]);
- t[5] = _mm_packs_epi32(u[4], u[5]);
- t[6] = _mm_packs_epi32(u[6], u[7]);
- t[8] = _mm_add_epi16(s[8], s[9]);
- t[9] = _mm_sub_epi16(s[8], s[9]);
- t[10] = _mm_sub_epi16(s[11], s[10]);
- t[11] = _mm_add_epi16(s[10], s[11]);
- t[12] = _mm_add_epi16(s[12], s[13]);
- t[13] = _mm_sub_epi16(s[12], s[13]);
- t[14] = _mm_sub_epi16(s[15], s[14]);
- t[15] = _mm_add_epi16(s[14], s[15]);
-
- // stage 4
- u[0] = _mm_unpacklo_epi16(t[0], t[1]);
- u[1] = _mm_unpackhi_epi16(t[0], t[1]);
- u[2] = _mm_unpacklo_epi16(t[2], t[3]);
- u[3] = _mm_unpackhi_epi16(t[2], t[3]);
- u[4] = _mm_unpacklo_epi16(t[9], t[14]);
- u[5] = _mm_unpackhi_epi16(t[9], t[14]);
- u[6] = _mm_unpacklo_epi16(t[10], t[13]);
- u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
- v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
- v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
- v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
- v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[0] = _mm_packs_epi32(u[0], u[1]);
- s[1] = _mm_packs_epi32(u[2], u[3]);
- s[2] = _mm_packs_epi32(u[4], u[5]);
- s[3] = _mm_packs_epi32(u[6], u[7]);
- s[4] = _mm_add_epi16(t[4], t[5]);
- s[5] = _mm_sub_epi16(t[4], t[5]);
- s[6] = _mm_sub_epi16(t[7], t[6]);
- s[7] = _mm_add_epi16(t[6], t[7]);
- s[8] = t[8];
- s[15] = t[15];
- s[9] = _mm_packs_epi32(u[8], u[9]);
- s[14] = _mm_packs_epi32(u[10], u[11]);
- s[10] = _mm_packs_epi32(u[12], u[13]);
- s[13] = _mm_packs_epi32(u[14], u[15]);
- s[11] = t[11];
- s[12] = t[12];
-
- // stage 5
- t[0] = _mm_add_epi16(s[0], s[3]);
- t[1] = _mm_add_epi16(s[1], s[2]);
- t[2] = _mm_sub_epi16(s[1], s[2]);
- t[3] = _mm_sub_epi16(s[0], s[3]);
- t[4] = s[4];
- t[7] = s[7];
-
- u[0] = _mm_unpacklo_epi16(s[5], s[6]);
- u[1] = _mm_unpackhi_epi16(s[5], s[6]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- t[5] = _mm_packs_epi32(u[0], u[1]);
- t[6] = _mm_packs_epi32(u[2], u[3]);
-
- t[8] = _mm_add_epi16(s[8], s[11]);
- t[9] = _mm_add_epi16(s[9], s[10]);
- t[10] = _mm_sub_epi16(s[9], s[10]);
- t[11] = _mm_sub_epi16(s[8], s[11]);
- t[12] = _mm_sub_epi16(s[15], s[12]);
- t[13] = _mm_sub_epi16(s[14], s[13]);
- t[14] = _mm_add_epi16(s[13], s[14]);
- t[15] = _mm_add_epi16(s[12], s[15]);
-
- // stage 6
- s[0] = _mm_add_epi16(t[0], t[7]);
- s[1] = _mm_add_epi16(t[1], t[6]);
- s[2] = _mm_add_epi16(t[2], t[5]);
- s[3] = _mm_add_epi16(t[3], t[4]);
- s[4] = _mm_sub_epi16(t[3], t[4]);
- s[5] = _mm_sub_epi16(t[2], t[5]);
- s[6] = _mm_sub_epi16(t[1], t[6]);
- s[7] = _mm_sub_epi16(t[0], t[7]);
- s[8] = t[8];
- s[9] = t[9];
-
- u[0] = _mm_unpacklo_epi16(t[10], t[13]);
- u[1] = _mm_unpackhi_epi16(t[10], t[13]);
- u[2] = _mm_unpacklo_epi16(t[11], t[12]);
- u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- s[10] = _mm_packs_epi32(u[0], u[1]);
- s[13] = _mm_packs_epi32(u[2], u[3]);
- s[11] = _mm_packs_epi32(u[4], u[5]);
- s[12] = _mm_packs_epi32(u[6], u[7]);
- s[14] = t[14];
- s[15] = t[15];
-
- // stage 7
- in[0] = _mm_add_epi16(s[0], s[15]);
- in[1] = _mm_add_epi16(s[1], s[14]);
- in[2] = _mm_add_epi16(s[2], s[13]);
- in[3] = _mm_add_epi16(s[3], s[12]);
- in[4] = _mm_add_epi16(s[4], s[11]);
- in[5] = _mm_add_epi16(s[5], s[10]);
- in[6] = _mm_add_epi16(s[6], s[9]);
- in[7] = _mm_add_epi16(s[7], s[8]);
- in[8] = _mm_sub_epi16(s[7], s[8]);
- in[9] = _mm_sub_epi16(s[6], s[9]);
- in[10] = _mm_sub_epi16(s[5], s[10]);
- in[11] = _mm_sub_epi16(s[4], s[11]);
- in[12] = _mm_sub_epi16(s[3], s[12]);
- in[13] = _mm_sub_epi16(s[2], s[13]);
- in[14] = _mm_sub_epi16(s[1], s[14]);
- in[15] = _mm_sub_epi16(s[0], s[15]);
-}
-
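-// av1_idct16_sse2()/av1_iadst16_sse2(): transpose the 16x16 block (stored as
-// two 8x16 halves) and run the corresponding 16-point 1-D transform on each
-// set of 8 columns.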
-void av1_idct16_sse2(__m128i *in0, __m128i *in1) {
- array_transpose_16x16(in0, in1);
- av1_idct16_8col(in0);
- av1_idct16_8col(in1);
-}
-
-void av1_iadst16_sse2(__m128i *in0, __m128i *in1) {
- array_transpose_16x16(in0, in1);
- av1_iadst16_8col(in0);
- av1_iadst16_8col(in1);
-}
-
-void av1_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
- int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- __m128i in[16], l[16];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8,
- stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0,
- stp1_12_0;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
- // First 1-D inverse DCT
- // Load input data.
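- // Only the top-left 4x4 block of coefficients is assumed non-zero here, so
- // just the first 8 coefficients of rows 0-3 (16 coefficients per row) are
- // loaded; the remaining coefficients are treated as zero.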
- in[0] = _mm_load_si128((const __m128i *)input);
- in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
-
- TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
-
- // Stage2
- {
- const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
- const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
-
- tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
- tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
- tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
- tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp5 = _mm_add_epi32(tmp5, rounding);
- tmp7 = _mm_add_epi32(tmp7, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
- stp2_8 = _mm_packs_epi32(tmp0, tmp2);
- stp2_11 = _mm_packs_epi32(tmp5, tmp7);
- }
-
- // Stage3
- {
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
-
- tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
- tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
- stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
- stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
-
- stp1_4 = _mm_packs_epi32(tmp0, tmp2);
- }
-
- // Stage4
- {
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero);
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
-
- tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
- tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
- tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
- tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
- tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
- tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp5 = _mm_add_epi32(tmp5, rounding);
- tmp7 = _mm_add_epi32(tmp7, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
- stp1_0 = _mm_packs_epi32(tmp0, tmp0);
- stp1_1 = _mm_packs_epi32(tmp2, tmp2);
- stp2_9 = _mm_packs_epi32(tmp1, tmp3);
- stp2_10 = _mm_packs_epi32(tmp5, tmp7);
-
- stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
- }
-
- // Stage5 and Stage6
- {
- tmp0 = _mm_add_epi16(stp2_8, stp2_11);
- tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
- tmp2 = _mm_add_epi16(stp2_9, stp2_10);
- tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
-
- stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
- stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
- stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
- stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
-
- stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
- stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
- stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
- stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
- }
-
- // Stage6
- {
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
-
- tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
- tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
- tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
- tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
- tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
- tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
-
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
-
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp1_6 = _mm_packs_epi32(tmp3, tmp1);
-
- stp2_10 = _mm_packs_epi32(tmp0, zero);
- stp2_13 = _mm_packs_epi32(tmp2, zero);
- stp2_11 = _mm_packs_epi32(tmp4, zero);
- stp2_12 = _mm_packs_epi32(tmp6, zero);
-
- tmp0 = _mm_add_epi16(stp1_0, stp1_4);
- tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
- tmp2 = _mm_add_epi16(stp1_1, stp1_6);
- tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
-
- stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
- stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
- stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
- stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
- stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
- stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
- stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
- stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
- }
-
- // Stage7. Left 8x16 only.
- l[0] = _mm_add_epi16(stp2_0, stp1_15);
- l[1] = _mm_add_epi16(stp2_1, stp1_14);
- l[2] = _mm_add_epi16(stp2_2, stp2_13);
- l[3] = _mm_add_epi16(stp2_3, stp2_12);
- l[4] = _mm_add_epi16(stp2_4, stp2_11);
- l[5] = _mm_add_epi16(stp2_5, stp2_10);
- l[6] = _mm_add_epi16(stp2_6, stp1_9);
- l[7] = _mm_add_epi16(stp2_7, stp1_8);
- l[8] = _mm_sub_epi16(stp2_7, stp1_8);
- l[9] = _mm_sub_epi16(stp2_6, stp1_9);
- l[10] = _mm_sub_epi16(stp2_5, stp2_10);
- l[11] = _mm_sub_epi16(stp2_4, stp2_11);
- l[12] = _mm_sub_epi16(stp2_3, stp2_12);
- l[13] = _mm_sub_epi16(stp2_2, stp2_13);
- l[14] = _mm_sub_epi16(stp2_1, stp1_14);
- l[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- // Second 1-D inverse transform, performed per 8x16 block
- for (i = 0; i < 2; i++) {
- int j;
- array_transpose_4X8(l + 8 * i, in);
-
- IDCT16_10
-
- // Stage7
- in[0] = _mm_add_epi16(stp2_0, stp1_15);
- in[1] = _mm_add_epi16(stp2_1, stp1_14);
- in[2] = _mm_add_epi16(stp2_2, stp2_13);
- in[3] = _mm_add_epi16(stp2_3, stp2_12);
- in[4] = _mm_add_epi16(stp2_4, stp2_11);
- in[5] = _mm_add_epi16(stp2_5, stp2_10);
- in[6] = _mm_add_epi16(stp2_6, stp1_9);
- in[7] = _mm_add_epi16(stp2_7, stp1_8);
- in[8] = _mm_sub_epi16(stp2_7, stp1_8);
- in[9] = _mm_sub_epi16(stp2_6, stp1_9);
- in[10] = _mm_sub_epi16(stp2_5, stp2_10);
- in[11] = _mm_sub_epi16(stp2_4, stp2_11);
- in[12] = _mm_sub_epi16(stp2_3, stp2_12);
- in[13] = _mm_sub_epi16(stp2_2, stp2_13);
- in[14] = _mm_sub_epi16(stp2_1, stp1_14);
- in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- for (j = 0; j < 16; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
-
- dest += 8;
- }
-}
-
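-// Load eight 16-bit coefficients into `reg` and advance `input` past them.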
-#define LOAD_DQCOEFF(reg, input) \
- { \
- reg = _mm_load_si128((const __m128i *)input); \
- input += 8; \
- }
-
-#define IDCT32_34 \
- /* Stage1 */ \
- { \
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
- \
- const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
- const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
- \
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
- \
- const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
- const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
- \
- MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \
- stp1_31); \
- MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \
- stp1_28); \
- MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \
- stp1_27); \
- MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \
- stp1_24); \
- } \
- \
- /* Stage2 */ \
- { \
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
- \
- const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
- const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
- \
- MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \
- stp2_15); \
- MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \
- stp2_12); \
- \
- stp2_16 = stp1_16; \
- stp2_19 = stp1_19; \
- \
- stp2_20 = stp1_20; \
- stp2_23 = stp1_23; \
- \
- stp2_24 = stp1_24; \
- stp2_27 = stp1_27; \
- \
- stp2_28 = stp1_28; \
- stp2_31 = stp1_31; \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
- \
- const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
- const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \
- \
- MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \
- stp1_7); \
- \
- stp1_8 = stp2_8; \
- stp1_11 = stp2_11; \
- stp1_12 = stp2_12; \
- stp1_15 = stp2_15; \
- \
- MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
- stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
- stp1_29) \
- MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
- stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
- stp1_25) \
- \
- stp1_16 = stp2_16; \
- stp1_31 = stp2_31; \
- stp1_19 = stp2_19; \
- stp1_20 = stp2_20; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_27 = stp2_27; \
- stp1_28 = stp2_28; \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
- \
- MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \
- stp2_1); \
- \
- stp2_4 = stp1_4; \
- stp2_5 = stp1_4; \
- stp2_6 = stp1_7; \
- stp2_7 = stp1_7; \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
- stp2_13) \
- \
- stp2_8 = stp1_8; \
- stp2_15 = stp1_15; \
- stp2_11 = stp1_11; \
- stp2_12 = stp1_12; \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
- stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
- stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
- stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
- stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
- stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
- stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
- stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
- stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
- const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- stp1_0 = stp2_0; \
- stp1_1 = stp2_1; \
- stp1_2 = stp2_1; \
- stp1_3 = stp2_0; \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_4 = stp2_4; \
- stp1_7 = stp2_7; \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- \
- MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
- stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
- stp1_28) \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
- stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
- stp1_26) \
- \
- stp1_22 = stp2_22; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_25 = stp2_25; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
- \
- stp2_8 = stp1_8; \
- stp2_9 = stp1_9; \
- stp2_14 = stp1_14; \
- stp2_15 = stp1_15; \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
- stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
- stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
- stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
- \
- stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
- stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
- stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
- stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
- } \
- \
- /* Stage7 */ \
- { \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
- const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
- stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
- stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
- stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
- stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
- stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
- stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
- stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
- stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
- stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- stp1_18 = stp2_18; \
- stp1_19 = stp2_19; \
- \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
- stp1_26) \
- MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
- stp1_24) \
- \
- stp1_28 = stp2_28; \
- stp1_29 = stp2_29; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
- }
-
-#define IDCT32 \
- /* Stage1 */ \
- { \
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
- const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
- const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
- \
- const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
- const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
- const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
- const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
- \
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
- const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
- const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
- \
- const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
- const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
- const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
- const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
- \
- MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
- stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \
- stp1_30) \
- MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \
- stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \
- MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
- stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
- stp1_21, stp1_26) \
- MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
- stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
- stp1_23, stp1_24) \
- } \
- \
- /* Stage2 */ \
- { \
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
- const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
- const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
- \
- const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
- const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
- const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
- const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
- \
- MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
- stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
- stp2_14) \
- MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
- stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
- stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
- stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
- \
- stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
- stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
- stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
- stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
- \
- stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
- stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
- stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
- } \
- \
- /* Stage3 */ \
- { \
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
- const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
- const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
- \
- const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
- const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- \
- MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
- stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
- stp1_6) \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
- stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
- \
- MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
- stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \
- stp1_29) \
- MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
- stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \
- stp1_25) \
- \
- stp1_16 = stp2_16; \
- stp1_31 = stp2_31; \
- stp1_19 = stp2_19; \
- stp1_20 = stp2_20; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_27 = stp2_27; \
- stp1_28 = stp2_28; \
- } \
- \
- /* Stage4 */ \
- { \
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
- const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
- const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
- \
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- \
- MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \
- stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \
- \
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
- \
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
- stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \
- stp2_13) \
- \
- stp2_8 = stp1_8; \
- stp2_15 = stp1_15; \
- stp2_11 = stp1_11; \
- stp2_12 = stp1_12; \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
- stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
- stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
- stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
- stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
- stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
- stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
- stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
- stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
- } \
- \
- /* Stage5 */ \
- { \
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
- \
- const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
- const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
- \
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
- \
- stp1_4 = stp2_4; \
- stp1_7 = stp2_7; \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- \
- MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
- stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \
- stp1_28) \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
- stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \
- stp1_26) \
- \
- stp1_22 = stp2_22; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_25 = stp2_25; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
- } \
- \
- /* Stage6 */ \
- { \
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
- \
- stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
- \
- stp2_8 = stp1_8; \
- stp2_9 = stp1_9; \
- stp2_14 = stp1_14; \
- stp2_15 = stp1_15; \
- \
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \
- stp2_12) \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
- stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
- stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
- stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
- \
- stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
- stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
- stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
- stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
- } \
- \
- /* Stage7 */ \
- { \
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
- \
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
- const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
- const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
- \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
- stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
- stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
- stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
- stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
- stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
- stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
- stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
- stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
- stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- stp1_18 = stp2_18; \
- stp1_19 = stp2_19; \
- \
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \
- stp1_26) \
- MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
- stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \
- stp1_24) \
- \
- stp1_28 = stp2_28; \
- stp1_29 = stp2_29; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
- }
-
-// Only the upper-left 8x8 block has non-zero coefficients.
-void av1_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
- int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
-
- // idct constants for each stage
- const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in[32], col[32];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
- stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
- stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
- stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i;
-
- // Load input data. Only the top-left 8x8 block needs to be loaded.
- in[0] = _mm_load_si128((const __m128i *)input);
- in[1] = _mm_load_si128((const __m128i *)(input + 32));
- in[2] = _mm_load_si128((const __m128i *)(input + 64));
- in[3] = _mm_load_si128((const __m128i *)(input + 96));
- in[4] = _mm_load_si128((const __m128i *)(input + 128));
- in[5] = _mm_load_si128((const __m128i *)(input + 160));
- in[6] = _mm_load_si128((const __m128i *)(input + 192));
- in[7] = _mm_load_si128((const __m128i *)(input + 224));
-
- for (i = 8; i < 32; ++i) {
- in[i] = _mm_setzero_si128();
- }
-
- array_transpose_8x8(in, in);
- // TODO(hkuang): The following transposes are unnecessary, but removing them
- // leads to a performance drop on some devices.
- array_transpose_8x8(in + 8, in + 8);
- array_transpose_8x8(in + 16, in + 16);
- array_transpose_8x8(in + 24, in + 24);
-
- IDCT32_34
-
- // 1-D: Store 32 intermediate results for each 8x32 block.
- col[0] = _mm_add_epi16(stp1_0, stp1_31);
- col[1] = _mm_add_epi16(stp1_1, stp1_30);
- col[2] = _mm_add_epi16(stp1_2, stp1_29);
- col[3] = _mm_add_epi16(stp1_3, stp1_28);
- col[4] = _mm_add_epi16(stp1_4, stp1_27);
- col[5] = _mm_add_epi16(stp1_5, stp1_26);
- col[6] = _mm_add_epi16(stp1_6, stp1_25);
- col[7] = _mm_add_epi16(stp1_7, stp1_24);
- col[8] = _mm_add_epi16(stp1_8, stp1_23);
- col[9] = _mm_add_epi16(stp1_9, stp1_22);
- col[10] = _mm_add_epi16(stp1_10, stp1_21);
- col[11] = _mm_add_epi16(stp1_11, stp1_20);
- col[12] = _mm_add_epi16(stp1_12, stp1_19);
- col[13] = _mm_add_epi16(stp1_13, stp1_18);
- col[14] = _mm_add_epi16(stp1_14, stp1_17);
- col[15] = _mm_add_epi16(stp1_15, stp1_16);
- col[16] = _mm_sub_epi16(stp1_15, stp1_16);
- col[17] = _mm_sub_epi16(stp1_14, stp1_17);
- col[18] = _mm_sub_epi16(stp1_13, stp1_18);
- col[19] = _mm_sub_epi16(stp1_12, stp1_19);
- col[20] = _mm_sub_epi16(stp1_11, stp1_20);
- col[21] = _mm_sub_epi16(stp1_10, stp1_21);
- col[22] = _mm_sub_epi16(stp1_9, stp1_22);
- col[23] = _mm_sub_epi16(stp1_8, stp1_23);
- col[24] = _mm_sub_epi16(stp1_7, stp1_24);
- col[25] = _mm_sub_epi16(stp1_6, stp1_25);
- col[26] = _mm_sub_epi16(stp1_5, stp1_26);
- col[27] = _mm_sub_epi16(stp1_4, stp1_27);
- col[28] = _mm_sub_epi16(stp1_3, stp1_28);
- col[29] = _mm_sub_epi16(stp1_2, stp1_29);
- col[30] = _mm_sub_epi16(stp1_1, stp1_30);
- col[31] = _mm_sub_epi16(stp1_0, stp1_31);
- for (i = 0; i < 4; i++) {
- int j;
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(col + i * 8, in);
- IDCT32_34
-
- // 2-D: Calculate the results and store them to the destination.
- in[0] = _mm_add_epi16(stp1_0, stp1_31);
- in[1] = _mm_add_epi16(stp1_1, stp1_30);
- in[2] = _mm_add_epi16(stp1_2, stp1_29);
- in[3] = _mm_add_epi16(stp1_3, stp1_28);
- in[4] = _mm_add_epi16(stp1_4, stp1_27);
- in[5] = _mm_add_epi16(stp1_5, stp1_26);
- in[6] = _mm_add_epi16(stp1_6, stp1_25);
- in[7] = _mm_add_epi16(stp1_7, stp1_24);
- in[8] = _mm_add_epi16(stp1_8, stp1_23);
- in[9] = _mm_add_epi16(stp1_9, stp1_22);
- in[10] = _mm_add_epi16(stp1_10, stp1_21);
- in[11] = _mm_add_epi16(stp1_11, stp1_20);
- in[12] = _mm_add_epi16(stp1_12, stp1_19);
- in[13] = _mm_add_epi16(stp1_13, stp1_18);
- in[14] = _mm_add_epi16(stp1_14, stp1_17);
- in[15] = _mm_add_epi16(stp1_15, stp1_16);
- in[16] = _mm_sub_epi16(stp1_15, stp1_16);
- in[17] = _mm_sub_epi16(stp1_14, stp1_17);
- in[18] = _mm_sub_epi16(stp1_13, stp1_18);
- in[19] = _mm_sub_epi16(stp1_12, stp1_19);
- in[20] = _mm_sub_epi16(stp1_11, stp1_20);
- in[21] = _mm_sub_epi16(stp1_10, stp1_21);
- in[22] = _mm_sub_epi16(stp1_9, stp1_22);
- in[23] = _mm_sub_epi16(stp1_8, stp1_23);
- in[24] = _mm_sub_epi16(stp1_7, stp1_24);
- in[25] = _mm_sub_epi16(stp1_6, stp1_25);
- in[26] = _mm_sub_epi16(stp1_5, stp1_26);
- in[27] = _mm_sub_epi16(stp1_4, stp1_27);
- in[28] = _mm_sub_epi16(stp1_3, stp1_28);
- in[29] = _mm_sub_epi16(stp1_2, stp1_29);
- in[30] = _mm_sub_epi16(stp1_1, stp1_30);
- in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
- for (j = 0; j < 32; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
-
- dest += 8;
- }
-}
-
-void av1_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
- int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
-
- // idct constants for each stage
- const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
- const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
- const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
- const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
- const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
- const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
- const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
- const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
-
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
-
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
-
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
-
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in[32], col[128], zero_idx[16];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
- stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
- stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
- stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i, j, i32;
-
- for (i = 0; i < 4; i++) {
- i32 = (i << 5);
- // First 1-D idct
- // Load input data.
- LOAD_DQCOEFF(in[0], input);
- LOAD_DQCOEFF(in[8], input);
- LOAD_DQCOEFF(in[16], input);
- LOAD_DQCOEFF(in[24], input);
- LOAD_DQCOEFF(in[1], input);
- LOAD_DQCOEFF(in[9], input);
- LOAD_DQCOEFF(in[17], input);
- LOAD_DQCOEFF(in[25], input);
- LOAD_DQCOEFF(in[2], input);
- LOAD_DQCOEFF(in[10], input);
- LOAD_DQCOEFF(in[18], input);
- LOAD_DQCOEFF(in[26], input);
- LOAD_DQCOEFF(in[3], input);
- LOAD_DQCOEFF(in[11], input);
- LOAD_DQCOEFF(in[19], input);
- LOAD_DQCOEFF(in[27], input);
-
- LOAD_DQCOEFF(in[4], input);
- LOAD_DQCOEFF(in[12], input);
- LOAD_DQCOEFF(in[20], input);
- LOAD_DQCOEFF(in[28], input);
- LOAD_DQCOEFF(in[5], input);
- LOAD_DQCOEFF(in[13], input);
- LOAD_DQCOEFF(in[21], input);
- LOAD_DQCOEFF(in[29], input);
- LOAD_DQCOEFF(in[6], input);
- LOAD_DQCOEFF(in[14], input);
- LOAD_DQCOEFF(in[22], input);
- LOAD_DQCOEFF(in[30], input);
- LOAD_DQCOEFF(in[7], input);
- LOAD_DQCOEFF(in[15], input);
- LOAD_DQCOEFF(in[23], input);
- LOAD_DQCOEFF(in[31], input);
-
- // Check whether all entries are zero.
- zero_idx[0] = _mm_or_si128(in[0], in[1]);
- zero_idx[1] = _mm_or_si128(in[2], in[3]);
- zero_idx[2] = _mm_or_si128(in[4], in[5]);
- zero_idx[3] = _mm_or_si128(in[6], in[7]);
- zero_idx[4] = _mm_or_si128(in[8], in[9]);
- zero_idx[5] = _mm_or_si128(in[10], in[11]);
- zero_idx[6] = _mm_or_si128(in[12], in[13]);
- zero_idx[7] = _mm_or_si128(in[14], in[15]);
- zero_idx[8] = _mm_or_si128(in[16], in[17]);
- zero_idx[9] = _mm_or_si128(in[18], in[19]);
- zero_idx[10] = _mm_or_si128(in[20], in[21]);
- zero_idx[11] = _mm_or_si128(in[22], in[23]);
- zero_idx[12] = _mm_or_si128(in[24], in[25]);
- zero_idx[13] = _mm_or_si128(in[26], in[27]);
- zero_idx[14] = _mm_or_si128(in[28], in[29]);
- zero_idx[15] = _mm_or_si128(in[30], in[31]);
-
- zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
- zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
- zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
- zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
- zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
- zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
- zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
- zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
-
- zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
- zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
- zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
- zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
- zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
- zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
- zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-
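- // zero_idx[14] now ORs together all 32 input vectors; if every bit is zero
- // the whole 8x32 block is zero, so its intermediate results are cleared and
- // the transform is skipped for this block.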
- if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
- col[i32 + 0] = _mm_setzero_si128();
- col[i32 + 1] = _mm_setzero_si128();
- col[i32 + 2] = _mm_setzero_si128();
- col[i32 + 3] = _mm_setzero_si128();
- col[i32 + 4] = _mm_setzero_si128();
- col[i32 + 5] = _mm_setzero_si128();
- col[i32 + 6] = _mm_setzero_si128();
- col[i32 + 7] = _mm_setzero_si128();
- col[i32 + 8] = _mm_setzero_si128();
- col[i32 + 9] = _mm_setzero_si128();
- col[i32 + 10] = _mm_setzero_si128();
- col[i32 + 11] = _mm_setzero_si128();
- col[i32 + 12] = _mm_setzero_si128();
- col[i32 + 13] = _mm_setzero_si128();
- col[i32 + 14] = _mm_setzero_si128();
- col[i32 + 15] = _mm_setzero_si128();
- col[i32 + 16] = _mm_setzero_si128();
- col[i32 + 17] = _mm_setzero_si128();
- col[i32 + 18] = _mm_setzero_si128();
- col[i32 + 19] = _mm_setzero_si128();
- col[i32 + 20] = _mm_setzero_si128();
- col[i32 + 21] = _mm_setzero_si128();
- col[i32 + 22] = _mm_setzero_si128();
- col[i32 + 23] = _mm_setzero_si128();
- col[i32 + 24] = _mm_setzero_si128();
- col[i32 + 25] = _mm_setzero_si128();
- col[i32 + 26] = _mm_setzero_si128();
- col[i32 + 27] = _mm_setzero_si128();
- col[i32 + 28] = _mm_setzero_si128();
- col[i32 + 29] = _mm_setzero_si128();
- col[i32 + 30] = _mm_setzero_si128();
- col[i32 + 31] = _mm_setzero_si128();
- continue;
- }
-
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(in, in);
- array_transpose_8x8(in + 8, in + 8);
- array_transpose_8x8(in + 16, in + 16);
- array_transpose_8x8(in + 24, in + 24);
-
- IDCT32
-
- // 1-D: Store 32 intermediate results for each 8x32 block.
- col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
- col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
- col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
- col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
- col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
- col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
- col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
- col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
- col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
- col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
- col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
- col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
- col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
- col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
- col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
- col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
- col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
- col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
- col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
- col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
- col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
- col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
- col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
- col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
- col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
- col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
- col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
- col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
- col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
- col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
- col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
- col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
- }
- for (i = 0; i < 4; i++) {
- // Second 1-D idct
- j = i << 3;
-
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(col + j, in);
- array_transpose_8x8(col + j + 32, in + 8);
- array_transpose_8x8(col + j + 64, in + 16);
- array_transpose_8x8(col + j + 96, in + 24);
-
- IDCT32
-
- // 2-D: Calculate the results and store them to the destination.
- in[0] = _mm_add_epi16(stp1_0, stp1_31);
- in[1] = _mm_add_epi16(stp1_1, stp1_30);
- in[2] = _mm_add_epi16(stp1_2, stp1_29);
- in[3] = _mm_add_epi16(stp1_3, stp1_28);
- in[4] = _mm_add_epi16(stp1_4, stp1_27);
- in[5] = _mm_add_epi16(stp1_5, stp1_26);
- in[6] = _mm_add_epi16(stp1_6, stp1_25);
- in[7] = _mm_add_epi16(stp1_7, stp1_24);
- in[8] = _mm_add_epi16(stp1_8, stp1_23);
- in[9] = _mm_add_epi16(stp1_9, stp1_22);
- in[10] = _mm_add_epi16(stp1_10, stp1_21);
- in[11] = _mm_add_epi16(stp1_11, stp1_20);
- in[12] = _mm_add_epi16(stp1_12, stp1_19);
- in[13] = _mm_add_epi16(stp1_13, stp1_18);
- in[14] = _mm_add_epi16(stp1_14, stp1_17);
- in[15] = _mm_add_epi16(stp1_15, stp1_16);
- in[16] = _mm_sub_epi16(stp1_15, stp1_16);
- in[17] = _mm_sub_epi16(stp1_14, stp1_17);
- in[18] = _mm_sub_epi16(stp1_13, stp1_18);
- in[19] = _mm_sub_epi16(stp1_12, stp1_19);
- in[20] = _mm_sub_epi16(stp1_11, stp1_20);
- in[21] = _mm_sub_epi16(stp1_10, stp1_21);
- in[22] = _mm_sub_epi16(stp1_9, stp1_22);
- in[23] = _mm_sub_epi16(stp1_8, stp1_23);
- in[24] = _mm_sub_epi16(stp1_7, stp1_24);
- in[25] = _mm_sub_epi16(stp1_6, stp1_25);
- in[26] = _mm_sub_epi16(stp1_5, stp1_26);
- in[27] = _mm_sub_epi16(stp1_4, stp1_27);
- in[28] = _mm_sub_epi16(stp1_3, stp1_28);
- in[29] = _mm_sub_epi16(stp1_2, stp1_29);
- in[30] = _mm_sub_epi16(stp1_1, stp1_30);
- in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
- for (j = 0; j < 32; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- RECON_AND_STORE(dest + j * stride, in[j]);
- }
-
- dest += 8;
- }
-}
-
-void av1_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
- __m128i dc_value;
- const __m128i zero = _mm_setzero_si128();
- int a, i;
-
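- // For a DC-only block each 1-D pass reduces to a multiply by cospi_16_64
- // with rounding, so the two dct_const_round_shift() calls stand in for the
- // row and column transforms; the shift by 6 is the final 32x32 output
- // rounding before the value is broadcast to every pixel.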
- a = dct_const_round_shift(input[0] * cospi_16_64);
- a = dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 6);
-
- dc_value = _mm_set1_epi16(a);
-
- for (i = 0; i < 4; ++i) {
- int j;
- for (j = 0; j < 32; ++j) {
- RECON_AND_STORE(dest + j * stride, dc_value);
- }
- dest += 8;
- }
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
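-// Clamp each 16-bit lane to the valid pixel range [0, (1 << bd) - 1]: lanes
-// above the maximum are replaced by the maximum and non-positive lanes are
-// zeroed by the final compare-and-mask.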
-static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
- __m128i ubounded, retval;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
- ubounded = _mm_cmpgt_epi16(value, max);
- retval = _mm_andnot_si128(ubounded, value);
- ubounded = _mm_and_si128(ubounded, max);
- retval = _mm_or_si128(retval, ubounded);
- retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
- return retval;
-}
-
-void av1_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[4 * 4];
- tran_low_t *outptr = out;
- int i, j;
- __m128i inptr[4];
- __m128i sign_bits[2];
- __m128i temp_mm, min_input, max_input;
- int test;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- int optimised_cols = 0;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i max = _mm_set1_epi16(12043);
- const __m128i min = _mm_set1_epi16(-12043);
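- // The +/-12043 bounds guard the 16-bit SSE2 path against overflow; if any
- // coefficient falls outside this range the slower C transform below is used
- // instead.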
- // Load input into __m128i
- inptr[0] = _mm_loadu_si128((const __m128i *)input);
- inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
- inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
- inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
-
- // Pack to 16 bits
- inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
- inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
-
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp_mm = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp_mm);
-
- if (!test) {
- // Do the row transform
- av1_idct4_sse2(inptr);
-
- // Check the min & max values
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp_mm = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp_mm);
-
- if (test) {
- transpose_4x4(inptr);
- sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
- sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
- inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
- inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
- inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
- inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
- _mm_storeu_si128((__m128i *)outptr, inptr[0]);
- _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
- _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
- _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct4_c(input, outptr, bd);
- input += 4;
- outptr += 4;
- }
- }
-
- if (optimised_cols) {
- av1_idct4_sse2(inptr);
-
- // Final round and shift
- inptr[0] = _mm_add_epi16(inptr[0], eight);
- inptr[1] = _mm_add_epi16(inptr[1], eight);
-
- inptr[0] = _mm_srai_epi16(inptr[0], 4);
- inptr[1] = _mm_srai_epi16(inptr[1], 4);
-
- // Reconstruction and Store
- {
- __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
- __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi64(
- d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
- d2 = _mm_unpacklo_epi64(
- d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
- d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
- d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
- // store input0
- _mm_storel_epi64((__m128i *)dest, d0);
- // store input1
- d0 = _mm_srli_si128(d0, 8);
- _mm_storel_epi64((__m128i *)(dest + stride), d0);
- // store input2
- _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
- // store input3
- d2 = _mm_srli_si128(d2, 8);
- _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[4], temp_out[4];
- // Columns
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
- av1_highbd_idct4_c(temp_in, temp_out, bd);
- for (j = 0; j < 4; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
- }
- }
- }
-}
-
-void av1_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8];
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[8];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i sixteen = _mm_set1_epi16(16);
- const __m128i max = _mm_set1_epi16(6201);
- const __m128i min = _mm_set1_epi16(-6201);
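- // Same overflow guard as the 4x4 case: coefficients outside [-6201, 6201]
- // route the block to the C fallback instead of the 16-bit SSE2 idct8 path.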
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 8; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- av1_idct8_sse2(inptr);
-
- // Find the min & max for the column transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- array_transpose_8x8(inptr, inptr);
- for (i = 0; i < 8; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 8; ++i) {
- av1_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
- }
-
- if (optimised_cols) {
- av1_idct8_sse2(inptr);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[8];
- for (i = 0; i < 8; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], sixteen);
- d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- inptr[i] = _mm_srai_epi16(inptr[i], 5);
- d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[8], temp_out[8];
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
- }
-}
-
-void av1_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[8 * 8] = { 0 };
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[8];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i sixteen = _mm_set1_epi16(16);
- const __m128i max = _mm_set1_epi16(6201);
- const __m128i min = _mm_set1_epi16(-6201);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 8; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- // only first 4 row has non-zero coefs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 4; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- av1_idct8_sse2(inptr);
-
- // Find the min & max for the column transform
- // N.B. Only first 4 cols contain non-zero coeffs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 8; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- // Use fact only first 4 rows contain non-zero coeffs
- array_transpose_4X8(inptr, inptr);
- for (i = 0; i < 4; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
- }
-
- if (optimised_cols) {
- av1_idct8_sse2(inptr);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[8];
- for (i = 0; i < 8; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], sixteen);
- d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- inptr[i] = _mm_srai_epi16(inptr[i], 5);
- d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[8], temp_out[8];
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- av1_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
- }
-}
-
-void av1_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16];
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[32];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i rounding = _mm_set1_epi16(32);
- const __m128i max = _mm_set1_epi16(3155);
- const __m128i min = _mm_set1_epi16(-3155);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 16; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
- inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 32; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform
- av1_idct16_sse2(inptr, inptr + 16);
-
- // Find the min & max for the column transform
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 32; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- array_transpose_16x16(inptr, inptr + 16);
- for (i = 0; i < 16; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
- sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 16; ++i) {
- av1_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
- }
-
- if (optimised_cols) {
- av1_idct16_sse2(inptr, inptr + 16);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[2];
- for (i = 0; i < 16; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], rounding);
- inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
- d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
- inptr[i] = _mm_srai_epi16(inptr[i], 6);
- inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
- d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
- d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
- _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[16], temp_out[16];
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
- }
-}
-
-void av1_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- tran_low_t out[16 * 16] = { 0 };
- tran_low_t *outptr = out;
- int i, j, test;
- __m128i inptr[32];
- __m128i min_input, max_input, temp1, temp2, sign_bits;
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i rounding = _mm_set1_epi16(32);
- const __m128i max = _mm_set1_epi16(3155);
- const __m128i min = _mm_set1_epi16(-3155);
- int optimised_cols = 0;
-
- // Load input into __m128i & pack to 16 bits
- for (i = 0; i < 16; i++) {
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
- inptr[i] = _mm_packs_epi32(temp1, temp2);
- temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
- temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
- inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
- }
-
- // Find the min & max for the row transform
- // Since all non-zero dct coefficients are in upper-left 4x4 area,
- // we only need to consider first 4 rows here.
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 4; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (!test) {
- // Do the row transform (N.B. This transposes inptr)
- av1_idct16_sse2(inptr, inptr + 16);
-
- // Find the min & max for the column transform
- // N.B. Only first 4 cols contain non-zero coeffs
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- for (i = 2; i < 16; i++) {
- max_input = _mm_max_epi16(max_input, inptr[i]);
- min_input = _mm_min_epi16(min_input, inptr[i]);
- }
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp1 = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp1);
-
- if (test) {
- // Use fact only first 4 rows contain non-zero coeffs
- array_transpose_8x8(inptr, inptr);
- array_transpose_8x8(inptr + 8, inptr + 16);
- for (i = 0; i < 4; i++) {
- sign_bits = _mm_cmplt_epi16(inptr[i], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
- sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
- temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
- temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
- _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
- }
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- av1_highbd_idct16_c(input, outptr, bd);
- input += 16;
- outptr += 16;
- }
- }
-
- if (optimised_cols) {
- av1_idct16_sse2(inptr, inptr + 16);
-
- // Final round & shift and Reconstruction and Store
- {
- __m128i d[2];
- for (i = 0; i < 16; i++) {
- inptr[i] = _mm_add_epi16(inptr[i], rounding);
- inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
- d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
- d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
- inptr[i] = _mm_srai_epi16(inptr[i], 6);
- inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
- d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
- d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
- // Store
- _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
- _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
- }
- }
- } else {
- // Run the un-optimised column transform
- tran_low_t temp_in[16], temp_out[16];
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- av1_highbd_idct16_c(temp_in, temp_out, bd);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
- }
- }
- }
-}
-#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/x86/av1_inv_txfm_sse2.h b/av1/common/x86/av1_inv_txfm_sse2.h
deleted file mode 100644
index a8bb6c1..0000000
--- a/av1/common/x86/av1_inv_txfm_sse2.h
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_INV_TXFM_SSE2_H_
-#define AOM_DSP_X86_INV_TXFM_SSE2_H_
-
-#include <emmintrin.h> // SSE2
-#include "./aom_config.h"
-#include "aom/aom_integer.h"
-#include "av1/common/av1_inv_txfm.h"
-
-// perform 8x8 transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
-
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-
- res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
- res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
- res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
- res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
- res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
- res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
- res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
- res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
-}
-
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- \
- in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
- in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
- }
-
-static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
-
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-
- out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
- out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
- out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
- out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
-}
-
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
- __m128i tbuf[8];
- array_transpose_8x8(res0, res0);
- array_transpose_8x8(res1, tbuf);
- array_transpose_8x8(res0 + 8, res1);
- array_transpose_8x8(res1 + 8, res1 + 8);
-
- res0[8] = tbuf[0];
- res0[9] = tbuf[1];
- res0[10] = tbuf[2];
- res0[11] = tbuf[3];
- res0[12] = tbuf[4];
- res0[13] = tbuf[5];
- res0[14] = tbuf[6];
- res0[15] = tbuf[7];
-}
-
-static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
- in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
- in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
- in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
- in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
- in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
- in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
- in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
- in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));
-
- in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
- in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
- in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
- in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
- in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
- in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
- in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
- in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
-}
-
-#define RECON_AND_STORE(dest, in_x) \
- { \
- __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
- d0 = _mm_unpacklo_epi8(d0, zero); \
- d0 = _mm_add_epi16(in_x, d0); \
- d0 = _mm_packus_epi16(d0, d0); \
- _mm_storel_epi64((__m128i *)(dest), d0); \
- }
-
-static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
- // Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
- in[4] = _mm_adds_epi16(in[4], final_rounding);
- in[5] = _mm_adds_epi16(in[5], final_rounding);
- in[6] = _mm_adds_epi16(in[6], final_rounding);
- in[7] = _mm_adds_epi16(in[7], final_rounding);
- in[8] = _mm_adds_epi16(in[8], final_rounding);
- in[9] = _mm_adds_epi16(in[9], final_rounding);
- in[10] = _mm_adds_epi16(in[10], final_rounding);
- in[11] = _mm_adds_epi16(in[11], final_rounding);
- in[12] = _mm_adds_epi16(in[12], final_rounding);
- in[13] = _mm_adds_epi16(in[13], final_rounding);
- in[14] = _mm_adds_epi16(in[14], final_rounding);
- in[15] = _mm_adds_epi16(in[15], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 6);
- in[1] = _mm_srai_epi16(in[1], 6);
- in[2] = _mm_srai_epi16(in[2], 6);
- in[3] = _mm_srai_epi16(in[3], 6);
- in[4] = _mm_srai_epi16(in[4], 6);
- in[5] = _mm_srai_epi16(in[5], 6);
- in[6] = _mm_srai_epi16(in[6], 6);
- in[7] = _mm_srai_epi16(in[7], 6);
- in[8] = _mm_srai_epi16(in[8], 6);
- in[9] = _mm_srai_epi16(in[9], 6);
- in[10] = _mm_srai_epi16(in[10], 6);
- in[11] = _mm_srai_epi16(in[11], 6);
- in[12] = _mm_srai_epi16(in[12], 6);
- in[13] = _mm_srai_epi16(in[13], 6);
- in[14] = _mm_srai_epi16(in[14], 6);
- in[15] = _mm_srai_epi16(in[15], 6);
-
- RECON_AND_STORE(dest + 0 * stride, in[0]);
- RECON_AND_STORE(dest + 1 * stride, in[1]);
- RECON_AND_STORE(dest + 2 * stride, in[2]);
- RECON_AND_STORE(dest + 3 * stride, in[3]);
- RECON_AND_STORE(dest + 4 * stride, in[4]);
- RECON_AND_STORE(dest + 5 * stride, in[5]);
- RECON_AND_STORE(dest + 6 * stride, in[6]);
- RECON_AND_STORE(dest + 7 * stride, in[7]);
- RECON_AND_STORE(dest + 8 * stride, in[8]);
- RECON_AND_STORE(dest + 9 * stride, in[9]);
- RECON_AND_STORE(dest + 10 * stride, in[10]);
- RECON_AND_STORE(dest + 11 * stride, in[11]);
- RECON_AND_STORE(dest + 12 * stride, in[12]);
- RECON_AND_STORE(dest + 13 * stride, in[13]);
- RECON_AND_STORE(dest + 14 * stride, in[14]);
- RECON_AND_STORE(dest + 15 * stride, in[15]);
-}
-
-#endif // AOM_DSP_X86_INV_TXFM_SSE2_H_
diff --git a/av1/common/x86/av1_txfm1d_sse4.h b/av1/common/x86/av1_txfm1d_sse4.h
new file mode 100644
index 0000000..af7afb7
--- /dev/null
+++ b/av1/common/x86/av1_txfm1d_sse4.h
@@ -0,0 +1,144 @@
+#ifndef AV1_TXFM1D_SSE4_H_
+#define AV1_TXFM1D_SSE4_H_
+
+#include <smmintrin.h>
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+void av1_iadst32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t *cos_bit, const int8_t *stage_range);
+
+static INLINE void transpose_32_4x4(int stride, const __m128i *input,
+ __m128i *output) {
+ __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
+ __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
+ __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
+ __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
+
+ output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+ output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+ output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+ output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+// The entire input block can be represented as a grid of 4x4 blocks,
+// and each 4x4 block can be represented by 4 vertical __m128i registers.
+// We first transpose each 4x4 block internally,
+// then transpose the grid.
+static INLINE void transpose_32(int txfm_size, const __m128i *input,
+ __m128i *output) {
+ const int num_per_128 = 4;
+ const int row_size = txfm_size;
+ const int col_size = txfm_size / num_per_128;
+ int r, c;
+
+ // transpose each 4x4 block internally
+ for (r = 0; r < row_size; r += 4) {
+ for (c = 0; c < col_size; c++) {
+ transpose_32_4x4(col_size, &input[r * col_size + c],
+ &output[c * 4 * col_size + r / 4]);
+ }
+ }
+}
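+// For example, with txfm_size = 8 we get col_size = 2, so the block is stored
+// as 8 rows of 2 __m128i each. The 4x4 sub-block at input[4 * col_size]
+// (rows 4..7, columns 0..3) is transposed by transpose_32_4x4() and written
+// to the __m128i column starting at output[1] (rows 0..3, columns 4..7),
+// which completes the full transpose.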
+
+static INLINE __m128i round_shift_32_sse4_1(__m128i vec, int bit) {
+ __m128i tmp, round;
+ round = _mm_set1_epi32(1 << (bit - 1));
+ tmp = _mm_add_epi32(vec, round);
+ return _mm_srai_epi32(tmp, bit);
+}
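+// round_shift_32_sse4_1() rounds to nearest before shifting: e.g. with
+// bit = 2, the value 7 becomes (7 + 2) >> 2 = 2 and -7 becomes
+// (-7 + 2) >> 2 = -2.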
+
+static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output,
+ const int size, const int bit) {
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = round_shift_32_sse4_1(input[i], bit);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = _mm_slli_epi32(input[i], -bit);
+ }
+ }
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \
+ ww0 = _mm_set1_epi32(w0); \
+ ww1 = _mm_set1_epi32(w1); \
+ in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = round_shift_32_sse4_1(out0, bit); \
+ in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in0_w1, in1_w0); \
+ out1 = round_shift_32_sse4_1(out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \
+ ww0 = _mm_set1_epi32(w0); \
+ ww1 = _mm_set1_epi32(w1); \
+ in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = round_shift_32_sse4_1(out0, bit); \
+ in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in1_w0, in0_w1); \
+ out1 = round_shift_32_sse4_1(out1, bit); \
+ } while (0)
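+// Both butterfly macros apply a 32-bit rotation by the weight pair (w0, w1)
+// with per-output rounding; they differ only in the sign of the second
+// output: type0 computes out1 = in0*w1 - in1*w0, type1 computes
+// out1 = in1*w0 - in0*w1.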
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // AV1_TXFM1D_SSE4_H_
diff --git a/av1/common/x86/filterintra_sse4.c b/av1/common/x86/filterintra_sse4.c
new file mode 100644
index 0000000..86b1c80
--- /dev/null
+++ b/av1/common/x86/filterintra_sse4.c
@@ -0,0 +1,889 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <smmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "aom_ports/mem.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+static INLINE void AddPixelsSmall(const uint8_t *above, const uint8_t *left,
+ __m128i *sum) {
+ const __m128i a = _mm_loadu_si128((const __m128i *)above);
+ const __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i u0 = _mm_unpacklo_epi8(a, zero);
+ __m128i u1 = _mm_unpacklo_epi8(l, zero);
+
+ sum[0] = _mm_add_epi16(u0, u1);
+}
+
+static INLINE int GetMeanValue4x4(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+
+ AddPixelsSmall(above, left, &sum_vector);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 4;
+ sum_value >>= 3;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
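+// The horizontal adds reduce the per-lane sums above[i] + left[i], i = 0..3,
+// to a single total; adding 4 and shifting right by 3 yields the rounded
+// mean of the 8 reference pixels (4 above + 4 left).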
+
+static INLINE int GetMeanValue8x8(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+
+ AddPixelsSmall(above, left, &sum_vector);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 8;
+ sum_value >>= 4;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+static INLINE void AddPixelsLarge(const uint8_t *above, const uint8_t *left,
+ __m128i *sum) {
+ const __m128i a = _mm_loadu_si128((const __m128i *)above);
+ const __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i u0 = _mm_unpacklo_epi8(a, zero);
+ __m128i u1 = _mm_unpacklo_epi8(l, zero);
+
+ sum[0] = _mm_add_epi16(u0, u1);
+
+ u0 = _mm_unpackhi_epi8(a, zero);
+ u1 = _mm_unpackhi_epi8(l, zero);
+
+ sum[0] = _mm_add_epi16(sum[0], u0);
+ sum[0] = _mm_add_epi16(sum[0], u1);
+}
+
+static INLINE int GetMeanValue16x16(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+
+ AddPixelsLarge(above, left, &sum_vector);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 16;
+ sum_value >>= 5;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+static INLINE int GetMeanValue32x32(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector[2], u;
+ uint16_t sum_value;
+
+ AddPixelsLarge(above, left, &sum_vector[0]);
+ AddPixelsLarge(above + 16, left + 16, &sum_vector[1]);
+
+ sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]);
+ sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 4 values
+ sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector[0], 2);
+ sum_vector[0] = _mm_add_epi16(sum_vector[0], u);
+
+ sum_value = _mm_extract_epi16(sum_vector[0], 0);
+ sum_value += 32;
+ sum_value >>= 6;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+// Note:
+// params[4] : mean value, repeated in all 4 int32_t lanes
+//
+static INLINE int CalcRefPixelsMeanValue(const uint8_t *above,
+ const uint8_t *left, int bs,
+ __m128i *params) {
+ int meanValue = 0;
+ switch (bs) {
+ case 4: meanValue = GetMeanValue4x4(above, left, params); break;
+ case 8: meanValue = GetMeanValue8x8(above, left, params); break;
+ case 16: meanValue = GetMeanValue16x16(above, left, params); break;
+ case 32: meanValue = GetMeanValue32x32(above, left, params); break;
+ default: assert(0);
+ }
+ return meanValue;
+}
+
+// Note:
+// params[0-3] : 4-tap filter coefficients (int32_t per coefficient)
+//
+static INLINE void GetIntraFilterParams(int bs, int mode, __m128i *params) {
+ const TX_SIZE tx_size =
+ (bs == 32) ? TX_32X32
+ : ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4)));
+ // c0
+ params[0] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][0],
+ av1_filter_intra_taps_4[tx_size][mode][0],
+ av1_filter_intra_taps_4[tx_size][mode][0],
+ av1_filter_intra_taps_4[tx_size][mode][0]);
+ // c1
+ params[1] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][1],
+ av1_filter_intra_taps_4[tx_size][mode][1],
+ av1_filter_intra_taps_4[tx_size][mode][1],
+ av1_filter_intra_taps_4[tx_size][mode][1]);
+ // c2
+ params[2] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][2],
+ av1_filter_intra_taps_4[tx_size][mode][2],
+ av1_filter_intra_taps_4[tx_size][mode][2],
+ av1_filter_intra_taps_4[tx_size][mode][2]);
+ // c3
+ params[3] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][3],
+ av1_filter_intra_taps_4[tx_size][mode][3],
+ av1_filter_intra_taps_4[tx_size][mode][3],
+ av1_filter_intra_taps_4[tx_size][mode][3]);
+}
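+// Each of params[0..3] broadcasts one of the four filter taps for the given
+// transform size and prediction mode into all four 32-bit lanes; params[4]
+// is filled later with the mean of the reference pixels.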
+
+static const int maxBlkSize = 32;
+
+static INLINE void SavePred4x4(int *pred, const __m128i *mean, uint8_t *dst,
+ ptrdiff_t stride) {
+ const int predStride = (maxBlkSize << 1) + 1;
+ __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+ __m128i p1 = _mm_loadu_si128((const __m128i *)(pred + predStride));
+ __m128i p2 = _mm_loadu_si128((const __m128i *)(pred + 2 * predStride));
+ __m128i p3 = _mm_loadu_si128((const __m128i *)(pred + 3 * predStride));
+
+ p0 = _mm_add_epi32(p0, mean[0]);
+ p1 = _mm_add_epi32(p1, mean[0]);
+ p2 = _mm_add_epi32(p2, mean[0]);
+ p3 = _mm_add_epi32(p3, mean[0]);
+
+ p0 = _mm_packus_epi32(p0, p1);
+ p1 = _mm_packus_epi32(p2, p3);
+ p0 = _mm_packus_epi16(p0, p1);
+
+ *((int *)dst) = _mm_cvtsi128_si32(p0);
+ p0 = _mm_srli_si128(p0, 4);
+ *((int *)(dst + stride)) = _mm_cvtsi128_si32(p0);
+ p0 = _mm_srli_si128(p0, 4);
+ *((int *)(dst + 2 * stride)) = _mm_cvtsi128_si32(p0);
+ p0 = _mm_srli_si128(p0, 4);
+ *((int *)(dst + 3 * stride)) = _mm_cvtsi128_si32(p0);
+}
+
+static void SavePred8x8(int *pred, const __m128i *mean, uint8_t *dst,
+ ptrdiff_t stride) {
+ const int predStride = (maxBlkSize << 1) + 1;
+ __m128i p0, p1, p2, p3;
+ int r = 0;
+
+ while (r < 8) {
+ p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
+ p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
+ r += 1;
+ p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
+ p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
+
+ p0 = _mm_add_epi32(p0, mean[0]);
+ p1 = _mm_add_epi32(p1, mean[0]);
+ p2 = _mm_add_epi32(p2, mean[0]);
+ p3 = _mm_add_epi32(p3, mean[0]);
+
+ p0 = _mm_packus_epi32(p0, p1);
+ p1 = _mm_packus_epi32(p2, p3);
+ p0 = _mm_packus_epi16(p0, p1);
+
+ _mm_storel_epi64((__m128i *)dst, p0);
+ dst += stride;
+ p0 = _mm_srli_si128(p0, 8);
+ _mm_storel_epi64((__m128i *)dst, p0);
+ dst += stride;
+ r += 1;
+ }
+}
+
+static void SavePred16x16(int *pred, const __m128i *mean, uint8_t *dst,
+ ptrdiff_t stride) {
+ const int predStride = (maxBlkSize << 1) + 1;
+ __m128i p0, p1, p2, p3;
+ int r = 0;
+
+ while (r < 16) {
+ p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
+ p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
+ p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8));
+ p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12));
+
+ p0 = _mm_add_epi32(p0, mean[0]);
+ p1 = _mm_add_epi32(p1, mean[0]);
+ p2 = _mm_add_epi32(p2, mean[0]);
+ p3 = _mm_add_epi32(p3, mean[0]);
+
+ p0 = _mm_packus_epi32(p0, p1);
+ p1 = _mm_packus_epi32(p2, p3);
+ p0 = _mm_packus_epi16(p0, p1);
+
+ _mm_storel_epi64((__m128i *)dst, p0);
+ p0 = _mm_srli_si128(p0, 8);
+ _mm_storel_epi64((__m128i *)(dst + 8), p0);
+ dst += stride;
+ r += 1;
+ }
+}
+
+static void SavePred32x32(int *pred, const __m128i *mean, uint8_t *dst,
+ ptrdiff_t stride) {
+ const int predStride = (maxBlkSize << 1) + 1;
+ __m128i p0, p1, p2, p3, p4, p5, p6, p7;
+ int r = 0;
+
+ while (r < 32) {
+ p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
+ p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
+ p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8));
+ p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12));
+
+ p4 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 16));
+ p5 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 20));
+ p6 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 24));
+ p7 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 28));
+
+ p0 = _mm_add_epi32(p0, mean[0]);
+ p1 = _mm_add_epi32(p1, mean[0]);
+ p2 = _mm_add_epi32(p2, mean[0]);
+ p3 = _mm_add_epi32(p3, mean[0]);
+
+ p4 = _mm_add_epi32(p4, mean[0]);
+ p5 = _mm_add_epi32(p5, mean[0]);
+ p6 = _mm_add_epi32(p6, mean[0]);
+ p7 = _mm_add_epi32(p7, mean[0]);
+
+ p0 = _mm_packus_epi32(p0, p1);
+ p1 = _mm_packus_epi32(p2, p3);
+ p0 = _mm_packus_epi16(p0, p1);
+
+ p4 = _mm_packus_epi32(p4, p5);
+ p5 = _mm_packus_epi32(p6, p7);
+ p4 = _mm_packus_epi16(p4, p5);
+
+ _mm_storel_epi64((__m128i *)dst, p0);
+ p0 = _mm_srli_si128(p0, 8);
+ _mm_storel_epi64((__m128i *)(dst + 8), p0);
+
+ _mm_storel_epi64((__m128i *)(dst + 16), p4);
+ p4 = _mm_srli_si128(p4, 8);
+ _mm_storel_epi64((__m128i *)(dst + 24), p4);
+
+ dst += stride;
+ r += 1;
+ }
+}
+
+static void SavePrediction(int *pred, const __m128i *mean, int bs, uint8_t *dst,
+ ptrdiff_t stride) {
+ switch (bs) {
+ case 4: SavePred4x4(pred, mean, dst, stride); break;
+ case 8: SavePred8x8(pred, mean, dst, stride); break;
+ case 16: SavePred16x16(pred, mean, dst, stride); break;
+ case 32: SavePred32x32(pred, mean, dst, stride); break;
+ default: assert(0);
+ }
+}
+
+typedef void (*ProducePixelsFunc)(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride);
+
+static void ProduceFourPixels(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride) {
+ __m128i u0, u1, u2;
+ int c0 = _mm_extract_epi32(prm[1], 0);
+ int x = *(pred + predStride);
+ int sum;
+
+ u0 = _mm_mullo_epi32(p[0], prm[2]);
+ u1 = _mm_mullo_epi32(p[1], prm[0]);
+ u2 = _mm_mullo_epi32(p[2], prm[3]);
+
+ u0 = _mm_add_epi32(u0, u1);
+ u0 = _mm_add_epi32(u0, u2);
+
+ sum = _mm_extract_epi32(u0, 0);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 1) = x;
+
+ sum = _mm_extract_epi32(u0, 1);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 2) = x;
+
+ sum = _mm_extract_epi32(u0, 2);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 3) = x;
+
+ sum = _mm_extract_epi32(u0, 3);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 4) = x;
+}
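+// Each new pixel combines the pixel above-left (weighted by prm[2]), above
+// (prm[0]) and above-right (prm[3]) with the just-produced left neighbour
+// (weighted by prm[1]), rounded by FILTER_INTRA_PREC_BITS. Because every
+// result becomes the left neighbour of the next output, the four SIMD lanes
+// are extracted and finished serially.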
+
+static void ProduceThreePixels(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride) {
+ __m128i u0, u1, u2;
+ int c0 = _mm_extract_epi32(prm[1], 0);
+ int x = *(pred + predStride);
+ int sum;
+
+ u0 = _mm_mullo_epi32(p[0], prm[2]);
+ u1 = _mm_mullo_epi32(p[1], prm[0]);
+ u2 = _mm_mullo_epi32(p[2], prm[3]);
+
+ u0 = _mm_add_epi32(u0, u1);
+ u0 = _mm_add_epi32(u0, u2);
+
+ sum = _mm_extract_epi32(u0, 0);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 1) = x;
+
+ sum = _mm_extract_epi32(u0, 1);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 2) = x;
+
+ sum = _mm_extract_epi32(u0, 2);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 3) = x;
+}
+
+static void ProduceTwoPixels(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride) {
+ __m128i u0, u1, u2;
+ int c0 = _mm_extract_epi32(prm[1], 0);
+ int x = *(pred + predStride);
+ int sum;
+
+ u0 = _mm_mullo_epi32(p[0], prm[2]);
+ u1 = _mm_mullo_epi32(p[1], prm[0]);
+ u2 = _mm_mullo_epi32(p[2], prm[3]);
+
+ u0 = _mm_add_epi32(u0, u1);
+ u0 = _mm_add_epi32(u0, u2);
+
+ sum = _mm_extract_epi32(u0, 0);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 1) = x;
+
+ sum = _mm_extract_epi32(u0, 1);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 2) = x;
+}
+
+static void ProduceOnePixels(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride) {
+ __m128i u0, u1, u2;
+ int c0 = _mm_extract_epi32(prm[1], 0);
+ int x = *(pred + predStride);
+ int sum;
+
+ u0 = _mm_mullo_epi32(p[0], prm[2]);
+ u1 = _mm_mullo_epi32(p[1], prm[0]);
+ u2 = _mm_mullo_epi32(p[2], prm[3]);
+
+ u0 = _mm_add_epi32(u0, u1);
+ u0 = _mm_add_epi32(u0, u2);
+
+ sum = _mm_extract_epi32(u0, 0);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 1) = x;
+}
+
+static ProducePixelsFunc prodPixelsFuncTab[4] = {
+ ProduceOnePixels, ProduceTwoPixels, ProduceThreePixels, ProduceFourPixels
+};
+
+static void ProducePixels(int *pred, const __m128i *prm, int remain) {
+ __m128i p[3];
+ const int predStride = (maxBlkSize << 1) + 1;
+ int index;
+
+ p[0] = _mm_loadu_si128((const __m128i *)pred);
+ p[1] = _mm_loadu_si128((const __m128i *)(pred + 1));
+ p[2] = _mm_loadu_si128((const __m128i *)(pred + 2));
+
+ if (remain <= 2) {
+ return;
+ }
+ if (remain > 5) {
+ index = 3;
+ } else {
+ index = remain - 3;
+ }
+ prodPixelsFuncTab[index](p, prm, pred, predStride);
+}
+
+// Note:
+// At column index c, the remaining pixels are R = 2 * bs + 1 - r - c
+// the number of pixels to produce is R - 2 = 2 * bs - r - c - 1
+static void GeneratePrediction(const uint8_t *above, const uint8_t *left,
+ const int bs, const __m128i *prm, int meanValue,
+ uint8_t *dst, ptrdiff_t stride) {
+ int pred[33][65];
+ int r, c, colBound;
+ int remainings;
+
+ for (r = 0; r < bs; ++r) {
+ pred[r + 1][0] = (int)left[r] - meanValue;
+ }
+
+ above -= 1;
+ for (c = 0; c < 2 * bs + 1; ++c) {
+ pred[0][c] = (int)above[c] - meanValue;
+ }
+
+ r = 0;
+ c = 0;
+ while (r < bs) {
+ colBound = (bs << 1) - r;
+ for (c = 0; c < colBound; c += 4) {
+ remainings = colBound - c + 1;
+ ProducePixels(&pred[r][c], prm, remainings);
+ }
+ r += 1;
+ }
+
+ SavePrediction(&pred[1][1], &prm[4], bs, dst, stride);
+}
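+// pred[][] is a (bs + 1) x (2 * bs + 1) working array of mean-removed values
+// (declared 33 x 65 to cover maxBlkSize): row 0 holds the 2 * bs + 1 above
+// reference pixels and column 0 holds the bs left reference pixels. Rows are
+// filled left to right, up to four pixels at a time, and SavePrediction()
+// adds the mean back to the bs x bs result and stores it.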
+
+static void FilterPrediction(const uint8_t *above, const uint8_t *left, int bs,
+ __m128i *prm, uint8_t *dst, ptrdiff_t stride) {
+ int meanValue = 0;
+ meanValue = CalcRefPixelsMeanValue(above, left, bs, &prm[4]);
+ GeneratePrediction(above, left, bs, prm, meanValue, dst, stride);
+}
+
+void av1_dc_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, DC_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_v_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, V_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_h_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, H_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d45_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D45_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d135_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D135_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d117_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D117_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d153_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D153_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d207_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D207_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_d63_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D63_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+void av1_tm_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, TM_PRED, &prm[0]);
+ FilterPrediction(above, left, bs, prm, dst, stride);
+}
+
+// ============== High Bit Depth ==============
+#if CONFIG_AOM_HIGHBITDEPTH
+static INLINE int HighbdGetMeanValue4x4(const uint16_t *above,
+ const uint16_t *left, const int bd,
+ __m128i *params) {
+ const __m128i a = _mm_loadu_si128((const __m128i *)above);
+ const __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+ (void)bd;
+
+ sum_vector = _mm_add_epi16(a, l);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 4;
+ sum_value >>= 3;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+static INLINE int HighbdGetMeanValue8x8(const uint16_t *above,
+ const uint16_t *left, const int bd,
+ __m128i *params) {
+ const __m128i a = _mm_loadu_si128((const __m128i *)above);
+ const __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+ (void)bd;
+
+ sum_vector = _mm_add_epi16(a, l);
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 8;
+ sum_value >>= 4;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+// Note:
+// Process 16 pixels each from above and left, 10-bit depth;
+// the second 8 pixels are accumulated onto the sum of the first 8.
+static INLINE void AddPixels10bit(const uint16_t *above, const uint16_t *left,
+ __m128i *sum) {
+ __m128i a = _mm_loadu_si128((const __m128i *)above);
+ __m128i l = _mm_loadu_si128((const __m128i *)left);
+ sum[0] = _mm_add_epi16(a, l);
+ a = _mm_loadu_si128((const __m128i *)(above + 8));
+ l = _mm_loadu_si128((const __m128i *)(left + 8));
+ sum[0] = _mm_add_epi16(sum[0], a);
+ sum[0] = _mm_add_epi16(sum[0], l);
+}
+
+// Note:
+// Process 16 pixels each from above and left, 12-bit depth;
+// the values are widened to 32-bit lanes before being accumulated.
+static INLINE void AddPixels12bit(const uint16_t *above, const uint16_t *left,
+ __m128i *sum) {
+ __m128i a = _mm_loadu_si128((const __m128i *)above);
+ __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i v0, v1;
+
+ v0 = _mm_unpacklo_epi16(a, zero);
+ v1 = _mm_unpacklo_epi16(l, zero);
+ sum[0] = _mm_add_epi32(v0, v1);
+
+ v0 = _mm_unpackhi_epi16(a, zero);
+ v1 = _mm_unpackhi_epi16(l, zero);
+ sum[0] = _mm_add_epi32(sum[0], v0);
+ sum[0] = _mm_add_epi32(sum[0], v1);
+
+ a = _mm_loadu_si128((const __m128i *)(above + 8));
+ l = _mm_loadu_si128((const __m128i *)(left + 8));
+
+ v0 = _mm_unpacklo_epi16(a, zero);
+ v1 = _mm_unpacklo_epi16(l, zero);
+ sum[0] = _mm_add_epi32(sum[0], v0);
+ sum[0] = _mm_add_epi32(sum[0], v1);
+
+ v0 = _mm_unpackhi_epi16(a, zero);
+ v1 = _mm_unpackhi_epi16(l, zero);
+ sum[0] = _mm_add_epi32(sum[0], v0);
+ sum[0] = _mm_add_epi32(sum[0], v1);
+}
+
+static INLINE int HighbdGetMeanValue16x16(const uint16_t *above,
+ const uint16_t *left, const int bd,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint32_t sum_value = 0;
+
+ if (10 == bd) {
+ AddPixels10bit(above, left, &sum_vector);
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector, 2);
+ sum_vector = _mm_add_epi16(sum_vector, u);
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ } else if (12 == bd) {
+ AddPixels12bit(above, left, &sum_vector);
+
+ sum_vector = _mm_hadd_epi32(sum_vector, zero);
+ u = _mm_srli_si128(sum_vector, 4);
+ sum_vector = _mm_add_epi32(u, sum_vector);
+ sum_value = _mm_extract_epi32(sum_vector, 0);
+ }
+
+ sum_value += 16;
+ sum_value >>= 5;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
+
+static INLINE int HighbdGetMeanValue32x32(const uint16_t *above,
+ const uint16_t *left, const int bd,
+ __m128i *params) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector[2], u;
+ uint32_t sum_value = 0;
+
+ if (10 == bd) {
+ AddPixels10bit(above, left, &sum_vector[0]);
+ AddPixels10bit(above + 16, left + 16, &sum_vector[1]);
+
+ sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]);
+ sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 4 values
+ sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector[0], 2);
+ sum_vector[0] = _mm_add_epi16(sum_vector[0], u);
+ sum_value = _mm_extract_epi16(sum_vector[0], 0);
+ } else if (12 == bd) {
+ AddPixels12bit(above, left, &sum_vector[0]);
+ AddPixels12bit(above + 16, left + 16, &sum_vector[1]);
+
+ sum_vector[0] = _mm_add_epi32(sum_vector[0], sum_vector[1]);
+ sum_vector[0] = _mm_hadd_epi32(sum_vector[0], zero);
+ u = _mm_srli_si128(sum_vector[0], 4);
+ sum_vector[0] = _mm_add_epi32(u, sum_vector[0]);
+ sum_value = _mm_extract_epi32(sum_vector[0], 0);
+ }
+
+ sum_value += 32;
+ sum_value >>= 6;
+ *params = _mm_set1_epi32(sum_value);
+ return sum_value;
+}
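+// The 10-bit paths can accumulate in 16-bit lanes because even the 32x32
+// case sums at most 64 pixels * 1023 = 65472, which still fits in 16 bits,
+// whereas the 12-bit paths (up to 64 * 4095 = 262080) are widened to 32-bit
+// lanes before summing.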
+
+// Note:
+// params[4] : mean value, repeated in all 4 int32_t lanes
+//
+static INLINE int HighbdCalcRefPixelsMeanValue(const uint16_t *above,
+ const uint16_t *left, int bs,
+ const int bd, __m128i *params) {
+ int meanValue = 0;
+ switch (bs) {
+ case 4: meanValue = HighbdGetMeanValue4x4(above, left, bd, params); break;
+ case 8: meanValue = HighbdGetMeanValue8x8(above, left, bd, params); break;
+ case 16:
+ meanValue = HighbdGetMeanValue16x16(above, left, bd, params);
+ break;
+ case 32:
+ meanValue = HighbdGetMeanValue32x32(above, left, bd, params);
+ break;
+ default: assert(0);
+ }
+ return meanValue;
+}
+
+// Note:
+// At column index c, the remaining pixels are R = 2 * bs + 1 - r - c
+// the number of pixels to produce is R - 2 = 2 * bs - r - c - 1
+static void HighbdGeneratePrediction(const uint16_t *above,
+ const uint16_t *left, const int bs,
+ const int bd, const __m128i *prm,
+ int meanValue, uint16_t *dst,
+ ptrdiff_t stride) {
+ int pred[33][65];
+ int r, c, colBound;
+ int remainings;
+ int ipred;
+
+ for (r = 0; r < bs; ++r) {
+ pred[r + 1][0] = (int)left[r] - meanValue;
+ }
+
+ above -= 1;
+ for (c = 0; c < 2 * bs + 1; ++c) {
+ pred[0][c] = (int)above[c] - meanValue;
+ }
+
+ r = 0;
+ c = 0;
+ while (r < bs) {
+ colBound = (bs << 1) - r;
+ for (c = 0; c < colBound; c += 4) {
+ remainings = colBound - c + 1;
+ ProducePixels(&pred[r][c], prm, remainings);
+ }
+ r += 1;
+ }
+
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) {
+ ipred = pred[r + 1][c + 1] + meanValue;
+ dst[c] = clip_pixel_highbd(ipred, bd);
+ }
+ dst += stride;
+ }
+}
+
+static void HighbdFilterPrediction(const uint16_t *above, const uint16_t *left,
+ int bs, const int bd, __m128i *prm,
+ uint16_t *dst, ptrdiff_t stride) {
+ int meanValue = 0;
+ meanValue = HighbdCalcRefPixelsMeanValue(above, left, bs, bd, &prm[4]);
+ HighbdGeneratePrediction(above, left, bs, bd, prm, meanValue, dst, stride);
+}
+
+void av1_highbd_dc_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, DC_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_v_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, V_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_h_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, H_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d45_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D45_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d135_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D135_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d117_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D117_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d153_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D153_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d207_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D207_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_d63_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, D63_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+
+void av1_highbd_tm_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
+ int bs, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ __m128i prm[5];
+ GetIntraFilterParams(bs, TM_PRED, &prm[0]);
+ HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c
new file mode 100644
index 0000000..eada3af
--- /dev/null
+++ b/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -0,0 +1,1398 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "av1/common/av1_inv_txfm2d_cfg.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+
+static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
+ in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
+ in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
+ in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
+}
+
+static void idct4x4_sse4_1(__m128i *in, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3, x, y;
+
+ v0 = _mm_unpacklo_epi32(in[0], in[1]);
+ v1 = _mm_unpackhi_epi32(in[0], in[1]);
+ v2 = _mm_unpacklo_epi32(in[2], in[3]);
+ v3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ u0 = _mm_unpacklo_epi64(v0, v2);
+ u1 = _mm_unpackhi_epi64(v0, v2);
+ u2 = _mm_unpacklo_epi64(v1, v3);
+ u3 = _mm_unpackhi_epi64(v1, v3);
+
+ x = _mm_mullo_epi32(u0, cospi32);
+ y = _mm_mullo_epi32(u2, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u1, cospi48);
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u1, cospi16);
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ in[0] = _mm_add_epi32(v0, v3);
+ in[1] = _mm_add_epi32(v1, v2);
+ in[2] = _mm_sub_epi32(v1, v2);
+ in[3] = _mm_sub_epi32(v0, v3);
+}
+
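+// 4-point inverse ADST. It shares the in-register transpose with the DCT
+// path, then applies the ADST stages: sign flips, a cospi32 rotation, and the
+// final cospi8/cospi24/cospi40/cospi56 rotations.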
+static void iadst4x4_sse4_1(__m128i *in, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3, x, y;
+
+ v0 = _mm_unpacklo_epi32(in[0], in[1]);
+ v1 = _mm_unpackhi_epi32(in[0], in[1]);
+ v2 = _mm_unpacklo_epi32(in[2], in[3]);
+ v3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ u0 = _mm_unpacklo_epi64(v0, v2);
+ u1 = _mm_unpackhi_epi64(v0, v2);
+ u2 = _mm_unpacklo_epi64(v1, v3);
+ u3 = _mm_unpackhi_epi64(v1, v3);
+
+ // stage 0
+ // stage 1
+ u1 = _mm_sub_epi32(zero, u1);
+ u3 = _mm_sub_epi32(zero, u3);
+
+ // stage 2
+ v0 = u0;
+ v1 = u3;
+ x = _mm_mullo_epi32(u1, cospi32);
+ y = _mm_mullo_epi32(u2, cospi32);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ v3 = _mm_sub_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ // stage 3
+ u0 = _mm_add_epi32(v0, v2);
+ u1 = _mm_add_epi32(v1, v3);
+ u2 = _mm_sub_epi32(v0, v2);
+ u3 = _mm_sub_epi32(v1, v3);
+
+ // stage 4
+ x = _mm_mullo_epi32(u0, cospi8);
+ y = _mm_mullo_epi32(u1, cospi56);
+ in[3] = _mm_add_epi32(x, y);
+ in[3] = _mm_add_epi32(in[3], rnding);
+ in[3] = _mm_srai_epi32(in[3], bit);
+
+ x = _mm_mullo_epi32(u0, cospi56);
+ y = _mm_mullo_epi32(u1, cospim8);
+ in[0] = _mm_add_epi32(x, y);
+ in[0] = _mm_add_epi32(in[0], rnding);
+ in[0] = _mm_srai_epi32(in[0], bit);
+
+ x = _mm_mullo_epi32(u2, cospi40);
+ y = _mm_mullo_epi32(u3, cospi24);
+ in[1] = _mm_add_epi32(x, y);
+ in[1] = _mm_add_epi32(in[1], rnding);
+ in[1] = _mm_srai_epi32(in[1], bit);
+
+ x = _mm_mullo_epi32(u2, cospi24);
+ y = _mm_mullo_epi32(u3, cospim40);
+ in[2] = _mm_add_epi32(x, y);
+ in[2] = _mm_add_epi32(in[2], rnding);
+ in[2] = _mm_srai_epi32(in[2], bit);
+}
+
+static INLINE void round_shift_4x4(__m128i *in, int shift) {
+ __m128i rnding = _mm_set1_epi32(1 << (shift - 1));
+
+ in[0] = _mm_add_epi32(in[0], rnding);
+ in[1] = _mm_add_epi32(in[1], rnding);
+ in[2] = _mm_add_epi32(in[2], rnding);
+ in[3] = _mm_add_epi32(in[3], rnding);
+
+ in[0] = _mm_srai_epi32(in[0], shift);
+ in[1] = _mm_srai_epi32(in[1], shift);
+ in[2] = _mm_srai_epi32(in[2], shift);
+ in[3] = _mm_srai_epi32(in[3], shift);
+}
+
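+// Clamp 16-bit pixel values to [0, (1 << bd) - 1] with compare/select, since
+// there is no saturating pack for arbitrary bit depths.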
+static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ __m128i clamped, mask;
+
+ mask = _mm_cmpgt_epi16(u, max);
+ clamped = _mm_andnot_si128(mask, u);
+ mask = _mm_and_si128(mask, max);
+ clamped = _mm_or_si128(mask, clamped);
+ mask = _mm_cmpgt_epi16(clamped, zero);
+ clamped = _mm_and_si128(clamped, mask);
+
+ return clamped;
+}
+
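+// Apply the final round shift, add the residual to the prediction already in
+// 'output' (optionally flipped left/right and/or up/down), clamp to the bit
+// depth, and store the reconstructed 4x4 block.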
+static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
+ int fliplr, int flipud, int shift, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+
+ round_shift_4x4(in, shift);
+
+ v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
+ v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
+ v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
+ v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
+
+ v0 = _mm_unpacklo_epi16(v0, zero);
+ v1 = _mm_unpacklo_epi16(v1, zero);
+ v2 = _mm_unpacklo_epi16(v2, zero);
+ v3 = _mm_unpacklo_epi16(v3, zero);
+
+ if (fliplr) {
+ in[0] = _mm_shuffle_epi32(in[0], 0x1B);
+ in[1] = _mm_shuffle_epi32(in[1], 0x1B);
+ in[2] = _mm_shuffle_epi32(in[2], 0x1B);
+ in[3] = _mm_shuffle_epi32(in[3], 0x1B);
+ }
+
+ if (flipud) {
+ u0 = _mm_add_epi32(in[3], v0);
+ u1 = _mm_add_epi32(in[2], v1);
+ u2 = _mm_add_epi32(in[1], v2);
+ u3 = _mm_add_epi32(in[0], v3);
+ } else {
+ u0 = _mm_add_epi32(in[0], v0);
+ u1 = _mm_add_epi32(in[1], v1);
+ u2 = _mm_add_epi32(in[2], v2);
+ u3 = _mm_add_epi32(in[3], v3);
+ }
+
+ v0 = _mm_packus_epi32(u0, u1);
+ v2 = _mm_packus_epi32(u2, u3);
+
+ u0 = highbd_clamp_epi16(v0, bd);
+ u2 = highbd_clamp_epi16(v2, bd);
+
+ v0 = _mm_unpacklo_epi64(u0, u0);
+ v1 = _mm_unpackhi_epi64(u0, u0);
+ v2 = _mm_unpacklo_epi64(u2, u2);
+ v3 = _mm_unpackhi_epi64(u2, u2);
+
+ _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
+ _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
+ _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
+ _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
+}
+
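+// 2D 4x4 inverse transform and reconstruction. The row and column transforms
+// are selected by tx_type; the FLIPADST variants reuse the ADST butterflies
+// and apply the flip in write_buffer_4x4.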
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
+ int stride, int tx_type, int bd) {
+ __m128i in[4];
+ const TXFM_2D_CFG *cfg = NULL;
+
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg = &inv_txfm_2d_cfg_dct_dct_4;
+ load_buffer_4x4(coeff, in);
+ idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_4;
+ load_buffer_4x4(coeff, in);
+ idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case DCT_ADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_4;
+ load_buffer_4x4(coeff, in);
+ idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+ case DCT_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 1, 1, -cfg->shift[1], bd);
+ break;
+ case ADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
+ }
+}
+
+// 8x8
+static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
+ in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
+ in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
+ in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
+ in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
+ in[4] = _mm_load_si128((const __m128i *)(coeff + 16));
+ in[5] = _mm_load_si128((const __m128i *)(coeff + 20));
+ in[6] = _mm_load_si128((const __m128i *)(coeff + 24));
+ in[7] = _mm_load_si128((const __m128i *)(coeff + 28));
+ in[8] = _mm_load_si128((const __m128i *)(coeff + 32));
+ in[9] = _mm_load_si128((const __m128i *)(coeff + 36));
+ in[10] = _mm_load_si128((const __m128i *)(coeff + 40));
+ in[11] = _mm_load_si128((const __m128i *)(coeff + 44));
+ in[12] = _mm_load_si128((const __m128i *)(coeff + 48));
+ in[13] = _mm_load_si128((const __m128i *)(coeff + 52));
+ in[14] = _mm_load_si128((const __m128i *)(coeff + 56));
+ in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
+}
+
+static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+ int col;
+
+  // Note:
+  // Even column: registers 0, 2, ..., 14
+  // Odd column: registers 1, 3, ..., 15
+  // One even-column register plus one odd-column register makes one row
+  // (8 coeffs), so in total we have 8 rows (8x8).
+ for (col = 0; col < 2; ++col) {
+ // stage 0
+ // stage 1
+ // stage 2
+ u0 = in[0 * 2 + col];
+ u1 = in[4 * 2 + col];
+ u2 = in[2 * 2 + col];
+ u3 = in[6 * 2 + col];
+
+ x = _mm_mullo_epi32(in[1 * 2 + col], cospi56);
+ y = _mm_mullo_epi32(in[7 * 2 + col], cospim8);
+ u4 = _mm_add_epi32(x, y);
+ u4 = _mm_add_epi32(u4, rnding);
+ u4 = _mm_srai_epi32(u4, bit);
+
+ x = _mm_mullo_epi32(in[1 * 2 + col], cospi8);
+ y = _mm_mullo_epi32(in[7 * 2 + col], cospi56);
+ u7 = _mm_add_epi32(x, y);
+ u7 = _mm_add_epi32(u7, rnding);
+ u7 = _mm_srai_epi32(u7, bit);
+
+ x = _mm_mullo_epi32(in[5 * 2 + col], cospi24);
+ y = _mm_mullo_epi32(in[3 * 2 + col], cospim40);
+ u5 = _mm_add_epi32(x, y);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ x = _mm_mullo_epi32(in[5 * 2 + col], cospi40);
+ y = _mm_mullo_epi32(in[3 * 2 + col], cospi24);
+ u6 = _mm_add_epi32(x, y);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ // stage 3
+ x = _mm_mullo_epi32(u0, cospi32);
+ y = _mm_mullo_epi32(u1, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi48);
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi16);
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ v4 = _mm_add_epi32(u4, u5);
+ v5 = _mm_sub_epi32(u4, u5);
+ v6 = _mm_sub_epi32(u7, u6);
+ v7 = _mm_add_epi32(u6, u7);
+
+ // stage 4
+ u0 = _mm_add_epi32(v0, v3);
+ u1 = _mm_add_epi32(v1, v2);
+ u2 = _mm_sub_epi32(v1, v2);
+ u3 = _mm_sub_epi32(v0, v3);
+ u4 = v4;
+ u7 = v7;
+
+ x = _mm_mullo_epi32(v5, cospi32);
+ y = _mm_mullo_epi32(v6, cospi32);
+ u6 = _mm_add_epi32(y, x);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ u5 = _mm_sub_epi32(y, x);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ // stage 5
+ out[0 * 2 + col] = _mm_add_epi32(u0, u7);
+ out[1 * 2 + col] = _mm_add_epi32(u1, u6);
+ out[2 * 2 + col] = _mm_add_epi32(u2, u5);
+ out[3 * 2 + col] = _mm_add_epi32(u3, u4);
+ out[4 * 2 + col] = _mm_sub_epi32(u3, u4);
+ out[5 * 2 + col] = _mm_sub_epi32(u2, u5);
+ out[6 * 2 + col] = _mm_sub_epi32(u1, u6);
+ out[7 * 2 + col] = _mm_sub_epi32(u0, u7);
+ }
+}
+
+static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+ int col;
+
+  // Note:
+  // Even column: registers 0, 2, ..., 14
+  // Odd column: registers 1, 3, ..., 15
+  // One even-column register plus one odd-column register makes one row
+  // (8 coeffs), so in total we have 8 rows (8x8).
+ for (col = 0; col < 2; ++col) {
+ // stage 0
+ // stage 1
+ u0 = in[2 * 0 + col];
+ u1 = _mm_sub_epi32(zero, in[2 * 7 + col]);
+ u2 = _mm_sub_epi32(zero, in[2 * 3 + col]);
+ u3 = in[2 * 4 + col];
+ u4 = _mm_sub_epi32(zero, in[2 * 1 + col]);
+ u5 = in[2 * 6 + col];
+ u6 = in[2 * 2 + col];
+ u7 = _mm_sub_epi32(zero, in[2 * 5 + col]);
+
+ // stage 2
+ v0 = u0;
+ v1 = u1;
+
+ x = _mm_mullo_epi32(u2, cospi32);
+ y = _mm_mullo_epi32(u3, cospi32);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ v3 = _mm_sub_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ v4 = u4;
+ v5 = u5;
+
+ x = _mm_mullo_epi32(u6, cospi32);
+ y = _mm_mullo_epi32(u7, cospi32);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ v7 = _mm_sub_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 3
+ u0 = _mm_add_epi32(v0, v2);
+ u1 = _mm_add_epi32(v1, v3);
+ u2 = _mm_sub_epi32(v0, v2);
+ u3 = _mm_sub_epi32(v1, v3);
+ u4 = _mm_add_epi32(v4, v6);
+ u5 = _mm_add_epi32(v5, v7);
+ u6 = _mm_sub_epi32(v4, v6);
+ u7 = _mm_sub_epi32(v5, v7);
+
+ // stage 4
+ v0 = u0;
+ v1 = u1;
+ v2 = u2;
+ v3 = u3;
+
+ x = _mm_mullo_epi32(u4, cospi16);
+ y = _mm_mullo_epi32(u5, cospi48);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi48);
+ y = _mm_mullo_epi32(u5, cospim16);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospim48);
+ y = _mm_mullo_epi32(u7, cospi16);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi16);
+ y = _mm_mullo_epi32(u7, cospi48);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 5
+ u0 = _mm_add_epi32(v0, v4);
+ u1 = _mm_add_epi32(v1, v5);
+ u2 = _mm_add_epi32(v2, v6);
+ u3 = _mm_add_epi32(v3, v7);
+ u4 = _mm_sub_epi32(v0, v4);
+ u5 = _mm_sub_epi32(v1, v5);
+ u6 = _mm_sub_epi32(v2, v6);
+ u7 = _mm_sub_epi32(v3, v7);
+
+ // stage 6
+ x = _mm_mullo_epi32(u0, cospi4);
+ y = _mm_mullo_epi32(u1, cospi60);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ x = _mm_mullo_epi32(u0, cospi60);
+ y = _mm_mullo_epi32(u1, cospim4);
+ v1 = _mm_add_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi20);
+ y = _mm_mullo_epi32(u3, cospi44);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi44);
+ y = _mm_mullo_epi32(u3, cospim20);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ x = _mm_mullo_epi32(u4, cospi36);
+ y = _mm_mullo_epi32(u5, cospi28);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi28);
+ y = _mm_mullo_epi32(u5, cospim36);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospi52);
+ y = _mm_mullo_epi32(u7, cospi12);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi12);
+ y = _mm_mullo_epi32(u7, cospim52);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 7
+ out[2 * 0 + col] = v1;
+ out[2 * 1 + col] = v6;
+ out[2 * 2 + col] = v3;
+ out[2 * 3 + col] = v4;
+ out[2 * 4 + col] = v5;
+ out[2 * 5 + col] = v2;
+ out[2 * 6 + col] = v7;
+ out[2 * 7 + col] = v0;
+ }
+}
+
+static void round_shift_8x8(__m128i *in, int shift) {
+ round_shift_4x4(&in[0], shift);
+ round_shift_4x4(&in[4], shift);
+ round_shift_4x4(&in[8], shift);
+ round_shift_4x4(&in[12], shift);
+}
+
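+// Widen one row of 8 prediction pixels to 32 bits, add the residual (reversed
+// when fliplr is set), then pack and clamp the reconstruction to 'bd' bits.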
+static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi,
+ int fliplr, int bd) {
+ __m128i x0, x1;
+ const __m128i zero = _mm_setzero_si128();
+
+ x0 = _mm_unpacklo_epi16(pred, zero);
+ x1 = _mm_unpackhi_epi16(pred, zero);
+
+ if (fliplr) {
+ res_lo = _mm_shuffle_epi32(res_lo, 0x1B);
+ res_hi = _mm_shuffle_epi32(res_hi, 0x1B);
+ x0 = _mm_add_epi32(res_hi, x0);
+ x1 = _mm_add_epi32(res_lo, x1);
+
+ } else {
+ x0 = _mm_add_epi32(res_lo, x0);
+ x1 = _mm_add_epi32(res_hi, x1);
+ }
+
+ x0 = _mm_packus_epi32(x0, x1);
+ return highbd_clamp_epi16(x0, bd);
+}
+
+static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
+ int fliplr, int flipud, int shift, int bd) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+
+ round_shift_8x8(in, shift);
+
+ v0 = _mm_load_si128((__m128i const *)(output + 0 * stride));
+ v1 = _mm_load_si128((__m128i const *)(output + 1 * stride));
+ v2 = _mm_load_si128((__m128i const *)(output + 2 * stride));
+ v3 = _mm_load_si128((__m128i const *)(output + 3 * stride));
+ v4 = _mm_load_si128((__m128i const *)(output + 4 * stride));
+ v5 = _mm_load_si128((__m128i const *)(output + 5 * stride));
+ v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
+ v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));
+
+ if (flipud) {
+ u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
+ u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
+ u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
+ u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
+ u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
+ u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
+ u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
+ u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
+ } else {
+ u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
+ u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
+ u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
+ u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
+ u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
+ u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
+ u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
+ u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
+ }
+
+ _mm_store_si128((__m128i *)(output + 0 * stride), u0);
+ _mm_store_si128((__m128i *)(output + 1 * stride), u1);
+ _mm_store_si128((__m128i *)(output + 2 * stride), u2);
+ _mm_store_si128((__m128i *)(output + 3 * stride), u3);
+ _mm_store_si128((__m128i *)(output + 4 * stride), u4);
+ _mm_store_si128((__m128i *)(output + 5 * stride), u5);
+ _mm_store_si128((__m128i *)(output + 6 * stride), u6);
+ _mm_store_si128((__m128i *)(output + 7 * stride), u7);
+}
+
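+// 2D 8x8 inverse transform and reconstruction. An explicit transpose_8x8 is
+// performed before each 1D pass; flips are again handled in the write stage.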
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
+ int stride, int tx_type, int bd) {
+ __m128i in[16], out[16];
+ const TXFM_2D_CFG *cfg = NULL;
+
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg = &inv_txfm_2d_cfg_dct_dct_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case DCT_ADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+ case DCT_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 1, 1, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(coeff, in);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+ transpose_8x8(in, out);
+ iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
+ }
+}
+
+// 16x16
+static void load_buffer_16x16(const int32_t *coeff, __m128i *in) {
+ int i;
+ for (i = 0; i < 64; ++i) {
+ in[i] = _mm_load_si128((const __m128i *)(coeff + (i << 2)));
+ }
+}
+
+static void assign_8x8_input_from_16x16(const __m128i *in, __m128i *in8x8,
+ int col) {
+ int i;
+ for (i = 0; i < 16; i += 2) {
+ in8x8[i] = in[col];
+ in8x8[i + 1] = in[col + 1];
+ col += 4;
+ }
+}
+
+static void swap_addr(uint16_t **output1, uint16_t **output2) {
+ uint16_t *tmp;
+ tmp = *output1;
+ *output1 = *output2;
+ *output2 = tmp;
+}
+
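+// Split the 16x16 result into four 8x8 quarters and reuse write_buffer_8x8.
+// Flips are implemented by swapping the quarter base pointers before writing.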
+static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride,
+ int fliplr, int flipud, int shift, int bd) {
+ __m128i in8x8[16];
+ uint16_t *leftUp = &output[0];
+ uint16_t *rightUp = &output[8];
+ uint16_t *leftDown = &output[8 * stride];
+ uint16_t *rightDown = &output[8 * stride + 8];
+
+ if (fliplr) {
+ swap_addr(&leftUp, &rightUp);
+ swap_addr(&leftDown, &rightDown);
+ }
+
+ if (flipud) {
+ swap_addr(&leftUp, &leftDown);
+ swap_addr(&rightUp, &rightDown);
+ }
+
+ // Left-up quarter
+ assign_8x8_input_from_16x16(in, in8x8, 0);
+ write_buffer_8x8(in8x8, leftUp, stride, fliplr, flipud, shift, bd);
+
+ // Right-up quarter
+ assign_8x8_input_from_16x16(in, in8x8, 2);
+ write_buffer_8x8(in8x8, rightUp, stride, fliplr, flipud, shift, bd);
+
+ // Left-down quarter
+ assign_8x8_input_from_16x16(in, in8x8, 32);
+ write_buffer_8x8(in8x8, leftDown, stride, fliplr, flipud, shift, bd);
+
+ // Right-down quarter
+ assign_8x8_input_from_16x16(in, in8x8, 34);
+ write_buffer_8x8(in8x8, rightDown, stride, fliplr, flipud, shift, bd);
+}
+
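+// 16-point inverse DCT. Each row of 16 coefficients spans four __m128i
+// registers, so the butterfly network below is evaluated once per column
+// group (col = 0..3).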
+static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[16], v[16], x, y;
+ int col;
+
+ for (col = 0; col < 4; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = in[0 * 4 + col];
+ u[1] = in[8 * 4 + col];
+ u[2] = in[4 * 4 + col];
+ u[3] = in[12 * 4 + col];
+ u[4] = in[2 * 4 + col];
+ u[5] = in[10 * 4 + col];
+ u[6] = in[6 * 4 + col];
+ u[7] = in[14 * 4 + col];
+ u[8] = in[1 * 4 + col];
+ u[9] = in[9 * 4 + col];
+ u[10] = in[5 * 4 + col];
+ u[11] = in[13 * 4 + col];
+ u[12] = in[3 * 4 + col];
+ u[13] = in[11 * 4 + col];
+ u[14] = in[7 * 4 + col];
+ u[15] = in[15 * 4 + col];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = half_btf_sse4_1(cospi60, u[8], cospim4, u[15], rnding, bit);
+ v[9] = half_btf_sse4_1(cospi28, u[9], cospim36, u[14], rnding, bit);
+ v[10] = half_btf_sse4_1(cospi44, u[10], cospim20, u[13], rnding, bit);
+ v[11] = half_btf_sse4_1(cospi12, u[11], cospim52, u[12], rnding, bit);
+ v[12] = half_btf_sse4_1(cospi52, u[11], cospi12, u[12], rnding, bit);
+ v[13] = half_btf_sse4_1(cospi20, u[10], cospi44, u[13], rnding, bit);
+ v[14] = half_btf_sse4_1(cospi36, u[9], cospi28, u[14], rnding, bit);
+ v[15] = half_btf_sse4_1(cospi4, u[8], cospi60, u[15], rnding, bit);
+
+ // stage 3
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+ u[4] = half_btf_sse4_1(cospi56, v[4], cospim8, v[7], rnding, bit);
+ u[5] = half_btf_sse4_1(cospi24, v[5], cospim40, v[6], rnding, bit);
+ u[6] = half_btf_sse4_1(cospi40, v[5], cospi24, v[6], rnding, bit);
+ u[7] = half_btf_sse4_1(cospi8, v[4], cospi56, v[7], rnding, bit);
+ u[8] = _mm_add_epi32(v[8], v[9]);
+ u[9] = _mm_sub_epi32(v[8], v[9]);
+ u[10] = _mm_sub_epi32(v[11], v[10]);
+ u[11] = _mm_add_epi32(v[10], v[11]);
+ u[12] = _mm_add_epi32(v[12], v[13]);
+ u[13] = _mm_sub_epi32(v[12], v[13]);
+ u[14] = _mm_sub_epi32(v[15], v[14]);
+ u[15] = _mm_add_epi32(v[14], v[15]);
+
+ // stage 4
+ x = _mm_mullo_epi32(u[0], cospi32);
+ y = _mm_mullo_epi32(u[1], cospi32);
+ v[0] = _mm_add_epi32(x, y);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_sub_epi32(x, y);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = half_btf_sse4_1(cospi48, u[2], cospim16, u[3], rnding, bit);
+ v[3] = half_btf_sse4_1(cospi16, u[2], cospi48, u[3], rnding, bit);
+ v[4] = _mm_add_epi32(u[4], u[5]);
+ v[5] = _mm_sub_epi32(u[4], u[5]);
+ v[6] = _mm_sub_epi32(u[7], u[6]);
+ v[7] = _mm_add_epi32(u[6], u[7]);
+ v[8] = u[8];
+ v[9] = half_btf_sse4_1(cospim16, u[9], cospi48, u[14], rnding, bit);
+ v[10] = half_btf_sse4_1(cospim48, u[10], cospim16, u[13], rnding, bit);
+ v[11] = u[11];
+ v[12] = u[12];
+ v[13] = half_btf_sse4_1(cospim16, u[10], cospi48, u[13], rnding, bit);
+ v[14] = half_btf_sse4_1(cospi48, u[9], cospi16, u[14], rnding, bit);
+ v[15] = u[15];
+
+ // stage 5
+ u[0] = _mm_add_epi32(v[0], v[3]);
+ u[1] = _mm_add_epi32(v[1], v[2]);
+ u[2] = _mm_sub_epi32(v[1], v[2]);
+ u[3] = _mm_sub_epi32(v[0], v[3]);
+ u[4] = v[4];
+
+ x = _mm_mullo_epi32(v[5], cospi32);
+ y = _mm_mullo_epi32(v[6], cospi32);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ u[8] = _mm_add_epi32(v[8], v[11]);
+ u[9] = _mm_add_epi32(v[9], v[10]);
+ u[10] = _mm_sub_epi32(v[9], v[10]);
+ u[11] = _mm_sub_epi32(v[8], v[11]);
+ u[12] = _mm_sub_epi32(v[15], v[12]);
+ u[13] = _mm_sub_epi32(v[14], v[13]);
+ u[14] = _mm_add_epi32(v[13], v[14]);
+ u[15] = _mm_add_epi32(v[12], v[15]);
+
+ // stage 6
+ v[0] = _mm_add_epi32(u[0], u[7]);
+ v[1] = _mm_add_epi32(u[1], u[6]);
+ v[2] = _mm_add_epi32(u[2], u[5]);
+ v[3] = _mm_add_epi32(u[3], u[4]);
+ v[4] = _mm_sub_epi32(u[3], u[4]);
+ v[5] = _mm_sub_epi32(u[2], u[5]);
+ v[6] = _mm_sub_epi32(u[1], u[6]);
+ v[7] = _mm_sub_epi32(u[0], u[7]);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[13], cospi32);
+ v[10] = _mm_sub_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_add_epi32(x, y);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi32);
+ y = _mm_mullo_epi32(u[12], cospi32);
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_add_epi32(x, y);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 7
+ out[0 * 4 + col] = _mm_add_epi32(v[0], v[15]);
+ out[1 * 4 + col] = _mm_add_epi32(v[1], v[14]);
+ out[2 * 4 + col] = _mm_add_epi32(v[2], v[13]);
+ out[3 * 4 + col] = _mm_add_epi32(v[3], v[12]);
+ out[4 * 4 + col] = _mm_add_epi32(v[4], v[11]);
+ out[5 * 4 + col] = _mm_add_epi32(v[5], v[10]);
+ out[6 * 4 + col] = _mm_add_epi32(v[6], v[9]);
+ out[7 * 4 + col] = _mm_add_epi32(v[7], v[8]);
+ out[8 * 4 + col] = _mm_sub_epi32(v[7], v[8]);
+ out[9 * 4 + col] = _mm_sub_epi32(v[6], v[9]);
+ out[10 * 4 + col] = _mm_sub_epi32(v[5], v[10]);
+ out[11 * 4 + col] = _mm_sub_epi32(v[4], v[11]);
+ out[12 * 4 + col] = _mm_sub_epi32(v[3], v[12]);
+ out[13 * 4 + col] = _mm_sub_epi32(v[2], v[13]);
+ out[14 * 4 + col] = _mm_sub_epi32(v[1], v[14]);
+ out[15 * 4 + col] = _mm_sub_epi32(v[0], v[15]);
+ }
+}
+
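+// 16-point inverse ADST, evaluated per column group in the same layout as the
+// IDCT above.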
+static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i u[16], v[16], x, y;
+ int col;
+
+ for (col = 0; col < 4; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = in[0 * 4 + col];
+ u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]);
+ u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]);
+ u[3] = in[8 * 4 + col];
+ u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]);
+ u[5] = in[12 * 4 + col];
+ u[6] = in[4 * 4 + col];
+ u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]);
+ u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]);
+ u[9] = in[14 * 4 + col];
+ u[10] = in[6 * 4 + col];
+ u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]);
+ u[12] = in[2 * 4 + col];
+ u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]);
+ u[14] = _mm_sub_epi32(zero, in[5 * 4 + col]);
+ u[15] = in[10 * 4 + col];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+
+ x = _mm_mullo_epi32(u[2], cospi32);
+ y = _mm_mullo_epi32(u[3], cospi32);
+ v[2] = _mm_add_epi32(x, y);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(x, y);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ x = _mm_mullo_epi32(u[6], cospi32);
+ y = _mm_mullo_epi32(u[7], cospi32);
+ v[6] = _mm_add_epi32(x, y);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(x, y);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[11], cospi32);
+ v[10] = _mm_add_epi32(x, y);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(x, y);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ x = _mm_mullo_epi32(u[14], cospi32);
+ y = _mm_mullo_epi32(u[15], cospi32);
+ v[14] = _mm_add_epi32(x, y);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(x, y);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 3
+ u[0] = _mm_add_epi32(v[0], v[2]);
+ u[1] = _mm_add_epi32(v[1], v[3]);
+ u[2] = _mm_sub_epi32(v[0], v[2]);
+ u[3] = _mm_sub_epi32(v[1], v[3]);
+ u[4] = _mm_add_epi32(v[4], v[6]);
+ u[5] = _mm_add_epi32(v[5], v[7]);
+ u[6] = _mm_sub_epi32(v[4], v[6]);
+ u[7] = _mm_sub_epi32(v[5], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[10]);
+ u[9] = _mm_add_epi32(v[9], v[11]);
+ u[10] = _mm_sub_epi32(v[8], v[10]);
+ u[11] = _mm_sub_epi32(v[9], v[11]);
+ u[12] = _mm_add_epi32(v[12], v[14]);
+ u[13] = _mm_add_epi32(v[13], v[15]);
+ u[14] = _mm_sub_epi32(v[12], v[14]);
+ u[15] = _mm_sub_epi32(v[13], v[15]);
+
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = half_btf_sse4_1(cospi16, u[4], cospi48, u[5], rnding, bit);
+ v[5] = half_btf_sse4_1(cospi48, u[4], cospim16, u[5], rnding, bit);
+ v[6] = half_btf_sse4_1(cospim48, u[6], cospi16, u[7], rnding, bit);
+ v[7] = half_btf_sse4_1(cospi16, u[6], cospi48, u[7], rnding, bit);
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+ v[12] = half_btf_sse4_1(cospi16, u[12], cospi48, u[13], rnding, bit);
+ v[13] = half_btf_sse4_1(cospi48, u[12], cospim16, u[13], rnding, bit);
+ v[14] = half_btf_sse4_1(cospim48, u[14], cospi16, u[15], rnding, bit);
+ v[15] = half_btf_sse4_1(cospi16, u[14], cospi48, u[15], rnding, bit);
+
+ // stage 5
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+ v[8] = half_btf_sse4_1(cospi8, u[8], cospi56, u[9], rnding, bit);
+ v[9] = half_btf_sse4_1(cospi56, u[8], cospim8, u[9], rnding, bit);
+ v[10] = half_btf_sse4_1(cospi40, u[10], cospi24, u[11], rnding, bit);
+ v[11] = half_btf_sse4_1(cospi24, u[10], cospim40, u[11], rnding, bit);
+ v[12] = half_btf_sse4_1(cospim56, u[12], cospi8, u[13], rnding, bit);
+ v[13] = half_btf_sse4_1(cospi8, u[12], cospi56, u[13], rnding, bit);
+ v[14] = half_btf_sse4_1(cospim24, u[14], cospi40, u[15], rnding, bit);
+ v[15] = half_btf_sse4_1(cospi40, u[14], cospi24, u[15], rnding, bit);
+
+ // stage 7
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ // stage 8
+ v[0] = half_btf_sse4_1(cospi2, u[0], cospi62, u[1], rnding, bit);
+ v[1] = half_btf_sse4_1(cospi62, u[0], cospim2, u[1], rnding, bit);
+ v[2] = half_btf_sse4_1(cospi10, u[2], cospi54, u[3], rnding, bit);
+ v[3] = half_btf_sse4_1(cospi54, u[2], cospim10, u[3], rnding, bit);
+ v[4] = half_btf_sse4_1(cospi18, u[4], cospi46, u[5], rnding, bit);
+ v[5] = half_btf_sse4_1(cospi46, u[4], cospim18, u[5], rnding, bit);
+ v[6] = half_btf_sse4_1(cospi26, u[6], cospi38, u[7], rnding, bit);
+ v[7] = half_btf_sse4_1(cospi38, u[6], cospim26, u[7], rnding, bit);
+ v[8] = half_btf_sse4_1(cospi34, u[8], cospi30, u[9], rnding, bit);
+ v[9] = half_btf_sse4_1(cospi30, u[8], cospim34, u[9], rnding, bit);
+ v[10] = half_btf_sse4_1(cospi42, u[10], cospi22, u[11], rnding, bit);
+ v[11] = half_btf_sse4_1(cospi22, u[10], cospim42, u[11], rnding, bit);
+ v[12] = half_btf_sse4_1(cospi50, u[12], cospi14, u[13], rnding, bit);
+ v[13] = half_btf_sse4_1(cospi14, u[12], cospim50, u[13], rnding, bit);
+ v[14] = half_btf_sse4_1(cospi58, u[14], cospi6, u[15], rnding, bit);
+ v[15] = half_btf_sse4_1(cospi6, u[14], cospim58, u[15], rnding, bit);
+
+ // stage 9
+ out[0 * 4 + col] = v[1];
+ out[1 * 4 + col] = v[14];
+ out[2 * 4 + col] = v[3];
+ out[3 * 4 + col] = v[12];
+ out[4 * 4 + col] = v[5];
+ out[5 * 4 + col] = v[10];
+ out[6 * 4 + col] = v[7];
+ out[7 * 4 + col] = v[8];
+ out[8 * 4 + col] = v[9];
+ out[9 * 4 + col] = v[6];
+ out[10 * 4 + col] = v[11];
+ out[11 * 4 + col] = v[4];
+ out[12 * 4 + col] = v[13];
+ out[13 * 4 + col] = v[2];
+ out[14 * 4 + col] = v[15];
+ out[15 * 4 + col] = v[0];
+ }
+}
+
+static void round_shift_16x16(__m128i *in, int shift) {
+ round_shift_8x8(&in[0], shift);
+ round_shift_8x8(&in[16], shift);
+ round_shift_8x8(&in[32], shift);
+ round_shift_8x8(&in[48], shift);
+}
+
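+// 2D 16x16 inverse transform and reconstruction. Unlike the smaller sizes, an
+// intermediate round_shift_16x16 by -shift[0] is applied between the row and
+// column passes.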
+void av1_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
+ int stride, int tx_type, int bd) {
+ __m128i in[64], out[64];
+ const TXFM_2D_CFG *cfg = NULL;
+
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg = &inv_txfm_2d_cfg_dct_dct_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case DCT_ADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ cfg = &inv_txfm_2d_cfg_adst_dct_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+ case DCT_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_dct_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case ADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_FLIPADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 1, 1, -cfg->shift[1], bd);
+ break;
+ case FLIPADST_ADST:
+ cfg = &inv_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(coeff, in);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+ round_shift_16x16(in, -cfg->shift[0]);
+ transpose_16x16(in, out);
+ iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+ write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd);
+ break;
+#endif  // CONFIG_EXT_TX
+ default: assert(0);
+ }
+}
diff --git a/av1/common/x86/highbd_txfm_utility_sse4.h b/av1/common/x86/highbd_txfm_utility_sse4.h
new file mode 100644
index 0000000..f1e298d
--- /dev/null
+++ b/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef HIGHBD_TXFM_UTILITY_SSE4_H
+#define HIGHBD_TXFM_UTILITY_SSE4_H
+
+#include <smmintrin.h> /* SSE4.1 */
+
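+// Transpose a 4x4 block of 32-bit lanes from x0..x3 into y0..y3 using the
+// usual 32-/64-bit unpack interleave sequence.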
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
+ do { \
+ __m128i u0, u1, u2, u3; \
+ u0 = _mm_unpacklo_epi32(x0, x1); \
+ u1 = _mm_unpackhi_epi32(x0, x1); \
+ u2 = _mm_unpacklo_epi32(x2, x3); \
+ u3 = _mm_unpackhi_epi32(x2, x3); \
+ y0 = _mm_unpacklo_epi64(u0, u2); \
+ y1 = _mm_unpackhi_epi64(u0, u2); \
+ y2 = _mm_unpacklo_epi64(u1, u3); \
+ y3 = _mm_unpackhi_epi64(u1, u3); \
+ } while (0)
+
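+// An 8x8 block of 32-bit values is stored as 16 __m128i registers, two per
+// row (even index: left four columns, odd index: right four columns). The
+// 16x16 layout below uses four registers per row in the same fashion.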
+static INLINE void transpose_8x8(const __m128i *in, __m128i *out) {
+ TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);
+ TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);
+ TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);
+ TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],
+ out[15]);
+}
+
+static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
+ // Upper left 8x8
+ TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]);
+ TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], out[24],
+ out[28]);
+ TRANSPOSE_4X4(in[16], in[20], in[24], in[28], out[1], out[5], out[9],
+ out[13]);
+ TRANSPOSE_4X4(in[17], in[21], in[25], in[29], out[17], out[21], out[25],
+ out[29]);
+
+ // Upper right 8x8
+ TRANSPOSE_4X4(in[2], in[6], in[10], in[14], out[32], out[36], out[40],
+ out[44]);
+ TRANSPOSE_4X4(in[3], in[7], in[11], in[15], out[48], out[52], out[56],
+ out[60]);
+ TRANSPOSE_4X4(in[18], in[22], in[26], in[30], out[33], out[37], out[41],
+ out[45]);
+ TRANSPOSE_4X4(in[19], in[23], in[27], in[31], out[49], out[53], out[57],
+ out[61]);
+
+ // Lower left 8x8
+ TRANSPOSE_4X4(in[32], in[36], in[40], in[44], out[2], out[6], out[10],
+ out[14]);
+ TRANSPOSE_4X4(in[33], in[37], in[41], in[45], out[18], out[22], out[26],
+ out[30]);
+ TRANSPOSE_4X4(in[48], in[52], in[56], in[60], out[3], out[7], out[11],
+ out[15]);
+ TRANSPOSE_4X4(in[49], in[53], in[57], in[61], out[19], out[23], out[27],
+ out[31]);
+ // Lower right 8x8
+ TRANSPOSE_4X4(in[34], in[38], in[42], in[46], out[34], out[38], out[42],
+ out[46]);
+ TRANSPOSE_4X4(in[35], in[39], in[43], in[47], out[50], out[54], out[58],
+ out[62]);
+ TRANSPOSE_4X4(in[50], in[54], in[58], in[62], out[35], out[39], out[43],
+ out[47]);
+ TRANSPOSE_4X4(in[51], in[55], in[59], in[63], out[51], out[55], out[59],
+ out[63]);
+}
+
+// Note:
+// rounding = 1 << (bit - 1)
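+// half_btf computes one butterfly output:
+//   (w0 * n0 + w1 * n1 + rounding) >> bit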
+static INLINE __m128i half_btf_sse4_1(__m128i w0, __m128i n0, __m128i w1,
+ __m128i n1, __m128i rounding, int bit) {
+ __m128i x, y;
+
+ x = _mm_mullo_epi32(w0, n0);
+ y = _mm_mullo_epi32(w1, n1);
+ x = _mm_add_epi32(x, y);
+ x = _mm_add_epi32(x, rounding);
+ x = _mm_srai_epi32(x, bit);
+ return x;
+}
+
+#endif  // HIGHBD_TXFM_UTILITY_SSE4_H
diff --git a/av1/common/x86/hybrid_inv_txfm_avx2.c b/av1/common/x86/hybrid_inv_txfm_avx2.c
new file mode 100644
index 0000000..754152c
--- /dev/null
+++ b/av1/common/x86/hybrid_inv_txfm_avx2.c
@@ -0,0 +1,503 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // avx2
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
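+// With CONFIG_AOM_HIGHBITDEPTH, tran_low_t is 32 bits wide, so each
+// coefficient is narrowed to a 16-bit lane; otherwise 16 coefficients are
+// loaded directly as one 256-bit vector.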
+static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ *in = _mm256_setr_epi16(
+ (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
+ (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
+ (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
+ (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
+ (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
+ (int16_t)coeff[15]);
+#else
+ *in = _mm256_loadu_si256((const __m256i *)coeff);
+#endif
+}
+
+static void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
+ int i = 0;
+ while (i < 16) {
+ load_coeff(coeff + (i << 4), &in[i]);
+ i += 1;
+ }
+}
+
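+// Add one row of 16 residuals to the 8-bit reconstruction: widen the pixels
+// to 16 bits, add the low/high halves of the 256-bit residual, then pack with
+// unsigned saturation and store.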
+static void recon_and_store(const __m256i *res, uint8_t *output) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i x = _mm_loadu_si128((__m128i const *)output);
+ __m128i p0 = _mm_unpacklo_epi8(x, zero);
+ __m128i p1 = _mm_unpackhi_epi8(x, zero);
+
+ p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res));
+ p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1));
+ x = _mm_packus_epi16(p0, p1);
+ _mm_storeu_si128((__m128i *)output, x);
+}
+
+#define IDCT_ROUNDING_POS (6)
+
+static void write_buffer_16x16(__m256i *in, const int stride, uint8_t *output) {
+ const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
+ int i = 0;
+
+ while (i < 16) {
+ in[i] = _mm256_add_epi16(in[i], rounding);
+ in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
+ recon_and_store(&in[i], output + i * stride);
+ i += 1;
+ }
+}
+
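+// Interleave two 16-bit input vectors and apply butter_fly() with the paired
+// cosine constants to produce both rotated outputs of a butterfly.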
+static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
+ const __m256i *c0, const __m256i *c1,
+ __m256i *b0, __m256i *b1) {
+ __m256i x0, x1;
+ x0 = _mm256_unpacklo_epi16(*a0, *a1);
+ x1 = _mm256_unpackhi_epi16(*a0, *a1);
+ *b0 = butter_fly(x0, x1, *c0);
+ *b1 = butter_fly(x0, x1, *c1);
+}
+
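+// 16-point IDCT computed on 16 lanes in parallel: each in[i] supplies one of
+// the 16 transform inputs, and the even/odd input groups are processed
+// separately before being combined in stage 7.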
+static void idct16_avx2(__m256i *in) {
+ const __m256i cospi_p30_m02 = pair256_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m256i cospi_p02_p30 = pair256_set_epi16(cospi_2_64, cospi_30_64);
+ const __m256i cospi_p14_m18 = pair256_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m256i cospi_p18_p14 = pair256_set_epi16(cospi_18_64, cospi_14_64);
+ const __m256i cospi_p22_m10 = pair256_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m256i cospi_p10_p22 = pair256_set_epi16(cospi_10_64, cospi_22_64);
+ const __m256i cospi_p06_m26 = pair256_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m256i cospi_p26_p06 = pair256_set_epi16(cospi_26_64, cospi_6_64);
+ const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
+ const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
+ const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+ const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
+ const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
+ const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m256i t0, t1, t2, t3, t4, t5, t6, t7;
+
+ // stage 1, (0-7)
+ u0 = in[0];
+ u1 = in[8];
+ u2 = in[4];
+ u3 = in[12];
+ u4 = in[2];
+ u5 = in[10];
+ u6 = in[6];
+ u7 = in[14];
+
+ // stage 2, (0-7)
+ // stage 3, (0-7)
+ t0 = u0;
+ t1 = u1;
+ t2 = u2;
+ t3 = u3;
+ unpack_butter_fly(&u4, &u7, &cospi_p28_m04, &cospi_p04_p28, &t4, &t7);
+ unpack_butter_fly(&u5, &u6, &cospi_p12_m20, &cospi_p20_p12, &t5, &t6);
+
+ // stage 4, (0-7)
+ unpack_butter_fly(&t0, &t1, &cospi_p16_p16, &cospi_p16_m16, &u0, &u1);
+ unpack_butter_fly(&t2, &t3, &cospi_p24_m08, &cospi_p08_p24, &u2, &u3);
+ u4 = _mm256_add_epi16(t4, t5);
+ u5 = _mm256_sub_epi16(t4, t5);
+ u6 = _mm256_sub_epi16(t7, t6);
+ u7 = _mm256_add_epi16(t7, t6);
+
+ // stage 5, (0-7)
+ t0 = _mm256_add_epi16(u0, u3);
+ t1 = _mm256_add_epi16(u1, u2);
+ t2 = _mm256_sub_epi16(u1, u2);
+ t3 = _mm256_sub_epi16(u0, u3);
+ t4 = u4;
+ t7 = u7;
+ unpack_butter_fly(&u6, &u5, &cospi_p16_m16, &cospi_p16_p16, &t5, &t6);
+
+ // stage 6, (0-7)
+ u0 = _mm256_add_epi16(t0, t7);
+ u1 = _mm256_add_epi16(t1, t6);
+ u2 = _mm256_add_epi16(t2, t5);
+ u3 = _mm256_add_epi16(t3, t4);
+ u4 = _mm256_sub_epi16(t3, t4);
+ u5 = _mm256_sub_epi16(t2, t5);
+ u6 = _mm256_sub_epi16(t1, t6);
+ u7 = _mm256_sub_epi16(t0, t7);
+
+ // stage 1, (8-15)
+ v0 = in[1];
+ v1 = in[9];
+ v2 = in[5];
+ v3 = in[13];
+ v4 = in[3];
+ v5 = in[11];
+ v6 = in[7];
+ v7 = in[15];
+
+ // stage 2, (8-15)
+ unpack_butter_fly(&v0, &v7, &cospi_p30_m02, &cospi_p02_p30, &t0, &t7);
+ unpack_butter_fly(&v1, &v6, &cospi_p14_m18, &cospi_p18_p14, &t1, &t6);
+ unpack_butter_fly(&v2, &v5, &cospi_p22_m10, &cospi_p10_p22, &t2, &t5);
+ unpack_butter_fly(&v3, &v4, &cospi_p06_m26, &cospi_p26_p06, &t3, &t4);
+
+ // stage 3, (8-15)
+ v0 = _mm256_add_epi16(t0, t1);
+ v1 = _mm256_sub_epi16(t0, t1);
+ v2 = _mm256_sub_epi16(t3, t2);
+ v3 = _mm256_add_epi16(t2, t3);
+ v4 = _mm256_add_epi16(t4, t5);
+ v5 = _mm256_sub_epi16(t4, t5);
+ v6 = _mm256_sub_epi16(t7, t6);
+ v7 = _mm256_add_epi16(t6, t7);
+
+ // stage 4, (8-15)
+ t0 = v0;
+ t7 = v7;
+ t3 = v3;
+ t4 = v4;
+ unpack_butter_fly(&v1, &v6, &cospi_m08_p24, &cospi_p24_p08, &t1, &t6);
+ unpack_butter_fly(&v2, &v5, &cospi_m24_m08, &cospi_m08_p24, &t2, &t5);
+
+ // stage 5, (8-15)
+ v0 = _mm256_add_epi16(t0, t3);
+ v1 = _mm256_add_epi16(t1, t2);
+ v2 = _mm256_sub_epi16(t1, t2);
+ v3 = _mm256_sub_epi16(t0, t3);
+ v4 = _mm256_sub_epi16(t7, t4);
+ v5 = _mm256_sub_epi16(t6, t5);
+ v6 = _mm256_add_epi16(t6, t5);
+ v7 = _mm256_add_epi16(t7, t4);
+
+ // stage 6, (8-15)
+ t0 = v0;
+ t1 = v1;
+ t6 = v6;
+ t7 = v7;
+ unpack_butter_fly(&v5, &v2, &cospi_p16_m16, &cospi_p16_p16, &t2, &t5);
+ unpack_butter_fly(&v4, &v3, &cospi_p16_m16, &cospi_p16_p16, &t3, &t4);
+
+ // stage 7
+ in[0] = _mm256_add_epi16(u0, t7);
+ in[1] = _mm256_add_epi16(u1, t6);
+ in[2] = _mm256_add_epi16(u2, t5);
+ in[3] = _mm256_add_epi16(u3, t4);
+ in[4] = _mm256_add_epi16(u4, t3);
+ in[5] = _mm256_add_epi16(u5, t2);
+ in[6] = _mm256_add_epi16(u6, t1);
+ in[7] = _mm256_add_epi16(u7, t0);
+ in[8] = _mm256_sub_epi16(u7, t0);
+ in[9] = _mm256_sub_epi16(u6, t1);
+ in[10] = _mm256_sub_epi16(u5, t2);
+ in[11] = _mm256_sub_epi16(u4, t3);
+ in[12] = _mm256_sub_epi16(u3, t4);
+ in[13] = _mm256_sub_epi16(u2, t5);
+ in[14] = _mm256_sub_epi16(u1, t6);
+ in[15] = _mm256_sub_epi16(u0, t7);
+}
+
+static void idct16(__m256i *in) {
+ mm256_transpose_16x16(in);
+ idct16_avx2(in);
+}
+
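+// Stage-1 rotations of the inverse ADST keep 32-bit intermediate precision:
+// each call yields four 32-bit product vectors that are rounded and packed
+// back to 16 bits only after the following add_rnd/sub_rnd.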
+static INLINE void butterfly_32b(const __m256i *a0, const __m256i *a1,
+ const __m256i *c0, const __m256i *c1,
+ __m256i *b) {
+ __m256i x0, x1;
+ x0 = _mm256_unpacklo_epi16(*a0, *a1);
+ x1 = _mm256_unpackhi_epi16(*a0, *a1);
+ b[0] = _mm256_madd_epi16(x0, *c0);
+ b[1] = _mm256_madd_epi16(x1, *c0);
+ b[2] = _mm256_madd_epi16(x0, *c1);
+ b[3] = _mm256_madd_epi16(x1, *c1);
+}
+
+static INLINE void group_rounding(__m256i *a, int num) {
+ const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ int i;
+ for (i = 0; i < num; ++i) {
+ a[i] = _mm256_add_epi32(a[i], dct_rounding);
+ a[i] = _mm256_srai_epi32(a[i], DCT_CONST_BITS);
+ }
+}
+
+static INLINE void add_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
+ __m256i x[4];
+ x[0] = _mm256_add_epi32(a[0], b[0]);
+ x[1] = _mm256_add_epi32(a[1], b[1]);
+ x[2] = _mm256_add_epi32(a[2], b[2]);
+ x[3] = _mm256_add_epi32(a[3], b[3]);
+
+ group_rounding(x, 4);
+
+ out[0] = _mm256_packs_epi32(x[0], x[1]);
+ out[1] = _mm256_packs_epi32(x[2], x[3]);
+}
+
+static INLINE void sub_rnd(const __m256i *a, const __m256i *b, __m256i *out) {
+ __m256i x[4];
+ x[0] = _mm256_sub_epi32(a[0], b[0]);
+ x[1] = _mm256_sub_epi32(a[1], b[1]);
+ x[2] = _mm256_sub_epi32(a[2], b[2]);
+ x[3] = _mm256_sub_epi32(a[3], b[3]);
+
+ group_rounding(x, 4);
+
+ out[0] = _mm256_packs_epi32(x[0], x[1]);
+ out[1] = _mm256_packs_epi32(x[2], x[3]);
+}
+
+static INLINE void butterfly_rnd(__m256i *a, __m256i *out) {
+ group_rounding(a, 4);
+ out[0] = _mm256_packs_epi32(a[0], a[1]);
+ out[1] = _mm256_packs_epi32(a[2], a[3]);
+}
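The four helpers above carry the stages that need full 32-bit precision: butterfly_32b() forms two per-lane dot products with _mm256_madd_epi16, and group_rounding()/add_rnd()/sub_rnd()/butterfly_rnd() apply the DCT constant rounding before packing back to 16 bits. A scalar sketch of one lane of butterfly_32b() followed by butterfly_rnd() (assuming the codebase's usual DCT_CONST_BITS == 14 convention; the helper name is made up for illustration):

#include <stdint.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

/* out0 = round((a0*c00 + a1*c01) / 2^DCT_CONST_BITS),
 * out1 = round((a0*c10 + a1*c11) / 2^DCT_CONST_BITS).
 * The AVX2 code does the same for 16 lanes at once and saturates on pack. */
static void butterfly_lane_sketch(int16_t a0, int16_t a1, int16_t c00,
                                  int16_t c01, int16_t c10, int16_t c11,
                                  int16_t *out0, int16_t *out1) {
  int32_t s0 = (int32_t)a0 * c00 + (int32_t)a1 * c01; /* _mm256_madd_epi16 */
  int32_t s1 = (int32_t)a0 * c10 + (int32_t)a1 * c11;
  *out0 = (int16_t)((s0 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
  *out1 = (int16_t)((s1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}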
+
+static void iadst16_avx2(__m256i *in) {
+ const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64);
+ const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64);
+ const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64);
+ const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64);
+ const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64);
+ const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
+ const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64);
+ const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64);
+ const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
+ const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
+ const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
+ const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64);
+ const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+ const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i x[16], s[16];
+ __m256i u[4], v[4];
+
+ // stage 1
+ butterfly_32b(&in[15], &in[0], &cospi_p01_p31, &cospi_p31_m01, u);
+ butterfly_32b(&in[7], &in[8], &cospi_p17_p15, &cospi_p15_m17, v);
+ add_rnd(u, v, &x[0]);
+ sub_rnd(u, v, &x[8]);
+
+ butterfly_32b(&in[13], &in[2], &cospi_p05_p27, &cospi_p27_m05, u);
+ butterfly_32b(&in[5], &in[10], &cospi_p21_p11, &cospi_p11_m21, v);
+ add_rnd(u, v, &x[2]);
+ sub_rnd(u, v, &x[10]);
+
+ butterfly_32b(&in[11], &in[4], &cospi_p09_p23, &cospi_p23_m09, u);
+ butterfly_32b(&in[3], &in[12], &cospi_p25_p07, &cospi_p07_m25, v);
+ add_rnd(u, v, &x[4]);
+ sub_rnd(u, v, &x[12]);
+
+ butterfly_32b(&in[9], &in[6], &cospi_p13_p19, &cospi_p19_m13, u);
+ butterfly_32b(&in[1], &in[14], &cospi_p29_p03, &cospi_p03_m29, v);
+ add_rnd(u, v, &x[6]);
+ sub_rnd(u, v, &x[14]);
+
+ // stage 2
+ s[0] = _mm256_add_epi16(x[0], x[4]);
+ s[1] = _mm256_add_epi16(x[1], x[5]);
+ s[2] = _mm256_add_epi16(x[2], x[6]);
+ s[3] = _mm256_add_epi16(x[3], x[7]);
+ s[4] = _mm256_sub_epi16(x[0], x[4]);
+ s[5] = _mm256_sub_epi16(x[1], x[5]);
+ s[6] = _mm256_sub_epi16(x[2], x[6]);
+ s[7] = _mm256_sub_epi16(x[3], x[7]);
+ butterfly_32b(&x[8], &x[9], &cospi_p04_p28, &cospi_p28_m04, u);
+ butterfly_32b(&x[12], &x[13], &cospi_m28_p04, &cospi_p04_p28, v);
+ add_rnd(u, v, &s[8]);
+ sub_rnd(u, v, &s[12]);
+
+ butterfly_32b(&x[10], &x[11], &cospi_p20_p12, &cospi_p12_m20, u);
+ butterfly_32b(&x[14], &x[15], &cospi_m12_p20, &cospi_p20_p12, v);
+ add_rnd(u, v, &s[10]);
+ sub_rnd(u, v, &s[14]);
+
+ // stage 3
+ x[0] = _mm256_add_epi16(s[0], s[2]);
+ x[1] = _mm256_add_epi16(s[1], s[3]);
+ x[2] = _mm256_sub_epi16(s[0], s[2]);
+ x[3] = _mm256_sub_epi16(s[1], s[3]);
+
+ x[8] = _mm256_add_epi16(s[8], s[10]);
+ x[9] = _mm256_add_epi16(s[9], s[11]);
+ x[10] = _mm256_sub_epi16(s[8], s[10]);
+ x[11] = _mm256_sub_epi16(s[9], s[11]);
+
+ butterfly_32b(&s[4], &s[5], &cospi_p08_p24, &cospi_p24_m08, u);
+ butterfly_32b(&s[6], &s[7], &cospi_m24_p08, &cospi_p08_p24, v);
+ add_rnd(u, v, &x[4]);
+ sub_rnd(u, v, &x[6]);
+
+ butterfly_32b(&s[12], &s[13], &cospi_p08_p24, &cospi_p24_m08, u);
+ butterfly_32b(&s[14], &s[15], &cospi_m24_p08, &cospi_p08_p24, v);
+ add_rnd(u, v, &x[12]);
+ sub_rnd(u, v, &x[14]);
+
+ // stage 4
+ butterfly_32b(&x[2], &x[3], &cospi_m16_m16, &cospi_p16_m16, u);
+ butterfly_32b(&x[6], &x[7], &cospi_p16_p16, &cospi_m16_p16, v);
+ butterfly_rnd(u, &x[2]);
+ butterfly_rnd(v, &x[6]);
+
+ butterfly_32b(&x[10], &x[11], &cospi_p16_p16, &cospi_m16_p16, u);
+ butterfly_32b(&x[14], &x[15], &cospi_m16_m16, &cospi_p16_m16, v);
+ butterfly_rnd(u, &x[10]);
+ butterfly_rnd(v, &x[14]);
+
+ in[0] = x[0];
+ in[1] = _mm256_sub_epi16(zero, x[8]);
+ in[2] = x[12];
+ in[3] = _mm256_sub_epi16(zero, x[4]);
+ in[4] = x[6];
+ in[5] = x[14];
+ in[6] = x[10];
+ in[7] = x[2];
+ in[8] = x[3];
+ in[9] = x[11];
+ in[10] = x[15];
+ in[11] = x[7];
+ in[12] = x[5];
+ in[13] = _mm256_sub_epi16(zero, x[13]);
+ in[14] = x[9];
+ in[15] = _mm256_sub_epi16(zero, x[1]);
+}
+
+static void iadst16(__m256i *in) {
+ mm256_transpose_16x16(in);
+ iadst16_avx2(in);
+}
+
+#if CONFIG_EXT_TX
+static void flip_row(__m256i *in, int rows) {
+ int i;
+ for (i = 0; i < rows; ++i) {
+ mm256_reverse_epi16(&in[i]);
+ }
+}
+
+static void flip_col(uint8_t **dest, int *stride, int rows) {
+ *dest = *dest + (rows - 1) * (*stride);
+ *stride = -*stride;
+}
+
+static void iidtx16(__m256i *in) {
+ mm256_transpose_16x16(in);
+ txfm_scaling16_avx2(Sqrt2, in);
+}
+#endif
+
+void av1_iht16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m256i in[16];
+
+ load_buffer_16x16(input, in);
+ switch (tx_type) {
+ case DCT_DCT:
+ idct16(in);
+ idct16(in);
+ break;
+ case ADST_DCT:
+ idct16(in);
+ iadst16(in);
+ break;
+ case DCT_ADST:
+ iadst16(in);
+ idct16(in);
+ break;
+ case ADST_ADST:
+ iadst16(in);
+ iadst16(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ idct16(in);
+ iadst16(in);
+ flip_col(&dest, &stride, 16);
+ break;
+ case DCT_FLIPADST:
+ iadst16(in);
+ idct16(in);
+ flip_row(in, 16);
+ break;
+ case FLIPADST_FLIPADST:
+ iadst16(in);
+ iadst16(in);
+ flip_row(in, 16);
+ flip_col(&dest, &stride, 16);
+ break;
+ case ADST_FLIPADST:
+ iadst16(in);
+ iadst16(in);
+ flip_row(in, 16);
+ break;
+ case FLIPADST_ADST:
+ iadst16(in);
+ iadst16(in);
+ flip_col(&dest, &stride, 16);
+ break;
+ case V_DCT:
+ iidtx16(in);
+ idct16(in);
+ break;
+ case H_DCT:
+ idct16(in);
+ iidtx16(in);
+ break;
+ case V_ADST:
+ iidtx16(in);
+ iadst16(in);
+ break;
+ case H_ADST:
+ iadst16(in);
+ iidtx16(in);
+ break;
+ case V_FLIPADST:
+ iidtx16(in);
+ iadst16(in);
+ flip_col(&dest, &stride, 16);
+ break;
+ case H_FLIPADST:
+ iadst16(in);
+ iidtx16(in);
+ flip_row(in, 16);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ write_buffer_16x16(in, stride, dest);
+}
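Taken together, av1_iht16x16_256_add_avx2() follows the usual separable structure: a 16-point pass in one dimension, a 16-point pass in the other (each picked by tx_type), optional row/column flips for the FLIPADST variants, and reconstruction into dest. A rough scalar outline of that flow; the (x + 32) >> 6 final rounding is the conventional one for this block size and is an assumption here, not taken from this patch:

#include <stdint.h>

typedef void (*itx16_1d)(const int32_t *in, int32_t *out); /* 16-point 1-D */

static uint8_t clip8(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

/* Scalar outline: rows first, then columns, then add to the prediction. */
static void iht16x16_outline(const int32_t *coeff, uint8_t *dest, int stride,
                             itx16_1d row_tx, itx16_1d col_tx) {
  int32_t tmp[16 * 16], col[16], out[16];
  int r, c;
  for (r = 0; r < 16; ++r) row_tx(coeff + r * 16, tmp + r * 16);
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r) col[r] = tmp[r * 16 + c];
    col_tx(col, out);
    for (r = 0; r < 16; ++r)
      dest[r * stride + c] = clip8(dest[r * stride + c] + ((out[r] + 32) >> 6));
  }
}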
diff --git a/av1/common/x86/idct_intrin_sse2.c b/av1/common/x86/idct_intrin_sse2.c
index 7f0f367..a6b6e1e 100644
--- a/av1/common/x86/idct_intrin_sse2.c
+++ b/av1/common/x86/idct_intrin_sse2.c
@@ -9,9 +9,54 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include "./av1_rtcd.h"
#include "aom_dsp/x86/inv_txfm_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_ports/mem.h"
+#include "av1/common/enums.h"
+
+#if CONFIG_EXT_TX
+static INLINE void fliplr_4x4(__m128i in[2]) {
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[0] = _mm_shufflehi_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[1] = _mm_shufflehi_epi16(in[1], 0x1b);
+}
+
+static INLINE void fliplr_8x8(__m128i in[8]) {
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+
+ in[4] = mm_reverse_epi16(in[4]);
+ in[5] = mm_reverse_epi16(in[5]);
+ in[6] = mm_reverse_epi16(in[6]);
+ in[7] = mm_reverse_epi16(in[7]);
+}
+
+static INLINE void fliplr_16x8(__m128i in[16]) {
+ fliplr_8x8(&in[0]);
+ fliplr_8x8(&in[8]);
+}
+
+#define FLIPLR_16x16(in0, in1) \
+ do { \
+ __m128i *tmp; \
+ fliplr_16x8(in0); \
+ fliplr_16x8(in1); \
+ tmp = (in0); \
+ (in0) = (in1); \
+ (in1) = tmp; \
+ } while (0)
+
+#define FLIPUD_PTR(dest, stride, size) \
+ do { \
+ (dest) = (dest) + ((size)-1) * (stride); \
+ (stride) = -(stride); \
+ } while (0)
+#endif
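FLIPUD_PTR is the whole vertical-flip trick in the SSE2 path: rather than reversing the coefficient rows, the output pointer is moved to the last row and the stride negated, so the unmodified rows are simply written bottom-up. The same thing written as a plain function:

#include <stdint.h>

/* Equivalent of FLIPUD_PTR(dest, stride, size): afterwards, row i of the
 * transform output is reconstructed into row (size - 1 - i) of the block. */
static void flipud_ptr(uint8_t **dest, int *stride, int size) {
  *dest += (size - 1) * (*stride);
  *stride = -*stride;
}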
void av1_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
@@ -23,22 +68,50 @@
in[1] = load_input_data(input + 8);
switch (tx_type) {
- case 0: // DCT_DCT
+ case DCT_DCT:
aom_idct4_sse2(in);
aom_idct4_sse2(in);
break;
- case 1: // ADST_DCT
+ case ADST_DCT:
aom_idct4_sse2(in);
aom_iadst4_sse2(in);
break;
- case 2: // DCT_ADST
+ case DCT_ADST:
aom_iadst4_sse2(in);
aom_idct4_sse2(in);
break;
- case 3: // ADST_ADST
+ case ADST_ADST:
aom_iadst4_sse2(in);
aom_iadst4_sse2(in);
break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ aom_idct4_sse2(in);
+ aom_iadst4_sse2(in);
+ FLIPUD_PTR(dest, stride, 4);
+ break;
+ case DCT_FLIPADST:
+ aom_iadst4_sse2(in);
+ aom_idct4_sse2(in);
+ fliplr_4x4(in);
+ break;
+ case FLIPADST_FLIPADST:
+ aom_iadst4_sse2(in);
+ aom_iadst4_sse2(in);
+ FLIPUD_PTR(dest, stride, 4);
+ fliplr_4x4(in);
+ break;
+ case ADST_FLIPADST:
+ aom_iadst4_sse2(in);
+ aom_iadst4_sse2(in);
+ fliplr_4x4(in);
+ break;
+ case FLIPADST_ADST:
+ aom_iadst4_sse2(in);
+ aom_iadst4_sse2(in);
+ FLIPUD_PTR(dest, stride, 4);
+ break;
+#endif // CONFIG_EXT_TX
default: assert(0); break;
}
@@ -51,12 +124,12 @@
// Reconstruction and Store
{
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
+ __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
__m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi32(d0,
- _mm_cvtsi32_si128(*(const int *)(dest + stride)));
- d2 = _mm_unpacklo_epi32(
- d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
+ __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+ d0 = _mm_unpacklo_epi32(d0, d1);
+ d2 = _mm_unpacklo_epi32(d2, d3);
d0 = _mm_unpacklo_epi8(d0, zero);
d2 = _mm_unpacklo_epi8(d2, zero);
d0 = _mm_add_epi16(d0, in[0]);
@@ -93,22 +166,50 @@
in[7] = load_input_data(input + 8 * 7);
switch (tx_type) {
- case 0: // DCT_DCT
+ case DCT_DCT:
aom_idct8_sse2(in);
aom_idct8_sse2(in);
break;
- case 1: // ADST_DCT
+ case ADST_DCT:
aom_idct8_sse2(in);
aom_iadst8_sse2(in);
break;
- case 2: // DCT_ADST
+ case DCT_ADST:
aom_iadst8_sse2(in);
aom_idct8_sse2(in);
break;
- case 3: // ADST_ADST
+ case ADST_ADST:
aom_iadst8_sse2(in);
aom_iadst8_sse2(in);
break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ aom_idct8_sse2(in);
+ aom_iadst8_sse2(in);
+ FLIPUD_PTR(dest, stride, 8);
+ break;
+ case DCT_FLIPADST:
+ aom_iadst8_sse2(in);
+ aom_idct8_sse2(in);
+ fliplr_8x8(in);
+ break;
+ case FLIPADST_FLIPADST:
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in);
+ FLIPUD_PTR(dest, stride, 8);
+ fliplr_8x8(in);
+ break;
+ case ADST_FLIPADST:
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in);
+ fliplr_8x8(in);
+ break;
+ case FLIPADST_ADST:
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in);
+ FLIPUD_PTR(dest, stride, 8);
+ break;
+#endif // CONFIG_EXT_TX
default: assert(0); break;
}
@@ -141,31 +242,285 @@
RECON_AND_STORE(dest + 7 * stride, in[7]);
}
+#if CONFIG_EXT_TX
+static void iidtx16_8col(__m128i *in) {
+ const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
+ const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i y0, y1, y2, y3, y4, y5, y6, y7;
+
+ in[0] = _mm_slli_epi16(in[0], 1);
+ in[1] = _mm_slli_epi16(in[1], 1);
+ in[2] = _mm_slli_epi16(in[2], 1);
+ in[3] = _mm_slli_epi16(in[3], 1);
+ in[4] = _mm_slli_epi16(in[4], 1);
+ in[5] = _mm_slli_epi16(in[5], 1);
+ in[6] = _mm_slli_epi16(in[6], 1);
+ in[7] = _mm_slli_epi16(in[7], 1);
+ in[8] = _mm_slli_epi16(in[8], 1);
+ in[9] = _mm_slli_epi16(in[9], 1);
+ in[10] = _mm_slli_epi16(in[10], 1);
+ in[11] = _mm_slli_epi16(in[11], 1);
+ in[12] = _mm_slli_epi16(in[12], 1);
+ in[13] = _mm_slli_epi16(in[13], 1);
+ in[14] = _mm_slli_epi16(in[14], 1);
+ in[15] = _mm_slli_epi16(in[15], 1);
+
+ v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
+ v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
+ v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
+ v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
+ v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
+ v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
+ v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
+ v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);
+
+ u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
+ u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
+ u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
+ u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
+ u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
+ u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
+ u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
+ u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);
+
+ x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
+ x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
+ x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
+ x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
+ x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
+ x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
+ x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
+ x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);
+
+ y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
+ y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
+ y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
+ y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
+ y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
+ y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
+ y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
+ y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);
+
+ v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
+ v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
+ v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
+ v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
+ v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
+ v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
+ v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
+ v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);
+
+ x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
+ x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
+ x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
+ x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
+ x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
+ x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
+ x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
+ x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);
+
+ u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
+ u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
+ u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
+ u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
+ u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
+ u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
+ u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
+ u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);
+
+ y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
+ y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
+ y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
+ y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
+ y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
+ y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
+ y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
+ y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);
+
+ v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
+ x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
+ x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
+ x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
+ x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
+ x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
+ x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
+ x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+ y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
+ y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
+ y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
+ y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
+ y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
+ y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
+ y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
+ y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
+ x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
+ x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
+ x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
+ x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
+ x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
+ x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
+ x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
+
+ u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
+ y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
+ y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
+ y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
+ y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
+ y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
+ y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
+ y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(v0, x0);
+ in[1] = _mm_packs_epi32(v1, x1);
+ in[2] = _mm_packs_epi32(v2, x2);
+ in[3] = _mm_packs_epi32(v3, x3);
+ in[4] = _mm_packs_epi32(v4, x4);
+ in[5] = _mm_packs_epi32(v5, x5);
+ in[6] = _mm_packs_epi32(v6, x6);
+ in[7] = _mm_packs_epi32(v7, x7);
+
+ in[8] = _mm_packs_epi32(u0, y0);
+ in[9] = _mm_packs_epi32(u1, y1);
+ in[10] = _mm_packs_epi32(u2, y2);
+ in[11] = _mm_packs_epi32(u3, y3);
+ in[12] = _mm_packs_epi32(u4, y4);
+ in[13] = _mm_packs_epi32(u5, y5);
+ in[14] = _mm_packs_epi32(u6, y6);
+ in[15] = _mm_packs_epi32(u7, y7);
+}
+
+static void iidtx16_sse2(__m128i *in0, __m128i *in1) {
+ array_transpose_16x16(in0, in1);
+ iidtx16_8col(in0);
+ iidtx16_8col(in1);
+}
+#endif
+
void av1_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride, int tx_type) {
- __m128i in0[16], in1[16];
+ __m128i in[32];
+ __m128i *in0 = &in[0];
+ __m128i *in1 = &in[16];
load_buffer_8x16(input, in0);
input += 8;
load_buffer_8x16(input, in1);
switch (tx_type) {
- case 0: // DCT_DCT
+ case DCT_DCT:
aom_idct16_sse2(in0, in1);
aom_idct16_sse2(in0, in1);
break;
- case 1: // ADST_DCT
+ case ADST_DCT:
aom_idct16_sse2(in0, in1);
aom_iadst16_sse2(in0, in1);
break;
- case 2: // DCT_ADST
+ case DCT_ADST:
aom_iadst16_sse2(in0, in1);
aom_idct16_sse2(in0, in1);
break;
- case 3: // ADST_ADST
+ case ADST_ADST:
aom_iadst16_sse2(in0, in1);
aom_iadst16_sse2(in0, in1);
break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ aom_idct16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+ case DCT_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ FLIPLR_16x16(in0, in1);
+ break;
+ case FLIPADST_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ FLIPLR_16x16(in0, in1);
+ break;
+ case ADST_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPLR_16x16(in0, in1);
+ break;
+ case FLIPADST_ADST:
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+ case V_DCT:
+ iidtx16_sse2(in0, in1);
+ aom_idct16_sse2(in0, in1);
+ break;
+ case H_DCT:
+ aom_idct16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ break;
+ case V_ADST:
+ iidtx16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ break;
+ case H_ADST:
+ aom_iadst16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ break;
+ case V_FLIPADST:
+ iidtx16_sse2(in0, in1);
+ aom_iadst16_sse2(in0, in1);
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+ case H_FLIPADST:
+ aom_iadst16_sse2(in0, in1);
+ iidtx16_sse2(in0, in1);
+ FLIPLR_16x16(in0, in1);
+ break;
+#endif // CONFIG_EXT_TX
default: assert(0); break;
}
@@ -173,3 +528,1156 @@
dest += 8;
write_buffer_8x16(dest, in1, stride);
}
+
+#if CONFIG_EXT_TX
+static void iidtx8_sse2(__m128i *in) {
+ in[0] = _mm_slli_epi16(in[0], 1);
+ in[1] = _mm_slli_epi16(in[1], 1);
+ in[2] = _mm_slli_epi16(in[2], 1);
+ in[3] = _mm_slli_epi16(in[3], 1);
+ in[4] = _mm_slli_epi16(in[4], 1);
+ in[5] = _mm_slli_epi16(in[5], 1);
+ in[6] = _mm_slli_epi16(in[6], 1);
+ in[7] = _mm_slli_epi16(in[7], 1);
+}
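iidtx8_sse2() can stay a plain shift because the 8-point inverse identity gain used here is exactly 2, while the 16-point version above needs a gain of 2*sqrt(2), which is why iidtx16_8col() widens to 32 bits, multiplies by the Sqrt2 constant and rounds. Per lane, as a sketch (DCT_CONST_BITS == 14 and the exact Sqrt2 value are taken as the codebase's conventions; sqrt2_q stands in for that constant):

#include <stdint.h>

#define DCT_CONST_BITS 14

static int16_t iidtx8_lane(int16_t in) { return (int16_t)(in * 2); }

/* 16-point identity lane: double, multiply by the fixed-point sqrt(2), round. */
static int16_t iidtx16_lane(int16_t in, int16_t sqrt2_q) {
  int32_t v = (int32_t)(in * 2) * sqrt2_q;
  return (int16_t)((v + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}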
+
+static INLINE void iidtx4_sse2(__m128i *in) {
+ const __m128i v_scale_w = _mm_set1_epi16(Sqrt2);
+
+ const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
+ const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
+ const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
+ const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
+
+ const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
+ const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
+
+ in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
+ in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
+}
+
+// Reverse each row of an 8x8 block of 16-bit coefficients (left/right flip).
+static INLINE void flip_buffer_lr_8x8(__m128i *in) {
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ in[4] = mm_reverse_epi16(in[4]);
+ in[5] = mm_reverse_epi16(in[5]);
+ in[6] = mm_reverse_epi16(in[6]);
+ in[7] = mm_reverse_epi16(in[7]);
+}
+#endif // CONFIG_EXT_TX
+
+static INLINE void scale_sqrt2_8x4(__m128i *in) {
+ // Implements 'ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS)'
+ // for each element
+ const __m128i v_scale_w = _mm_set1_epi16(Sqrt2);
+
+ const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
+ const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
+ const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
+ const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
+ const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
+ const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
+ const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
+ const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
+
+ const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
+ const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
+ const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
+ const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
+ const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
+ const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
+
+ in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
+ in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
+ in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
+ in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
+}
+
+static INLINE void scale_sqrt2_8x8(__m128i *in) {
+  // Implements 'ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS)'
+  // for each element
+ const __m128i v_scale_w = _mm_set1_epi16(Sqrt2);
+
+ const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
+ const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
+ const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
+ const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
+ const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
+ const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
+ const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
+ const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
+ const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w);
+ const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w);
+ const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w);
+ const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w);
+ const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w);
+ const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w);
+ const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w);
+ const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w);
+
+ const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
+ const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
+ const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
+ const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
+ const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
+ const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
+ const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w);
+ const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w);
+ const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w);
+ const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w);
+ const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w);
+ const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w);
+ const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w);
+ const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w);
+
+ in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
+ in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
+ in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
+ in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
+ in[4] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p4a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p4b_d, DCT_CONST_BITS));
+ in[5] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p5a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p5b_d, DCT_CONST_BITS));
+ in[6] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p6a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p6b_d, DCT_CONST_BITS));
+ in[7] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p7a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS));
+}
+
+static INLINE void scale_sqrt2_8x16(__m128i *in) {
+ scale_sqrt2_8x8(in);
+ scale_sqrt2_8x8(in + 8);
+}
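The scale_sqrt2_* helpers rebuild the full 32-bit product in * Sqrt2 by interleaving the low 16-bit halves (_mm_mullo_epi16) with the high 16-bit halves (_mm_mulhi_epi16), then round exactly as the comments state. One lane of that as a scalar sketch (assuming DCT_CONST_BITS == 14; sqrt2_q stands in for the Sqrt2 constant):

#include <stdint.h>

#define DCT_CONST_BITS 14

/* ROUND_POWER_OF_TWO(in * Sqrt2, DCT_CONST_BITS) for a single coefficient. */
static int16_t scale_sqrt2_lane(int16_t in, int16_t sqrt2_q) {
  int32_t prod = (int32_t)in * sqrt2_q; /* mullo/mulhi + unpack in the SIMD code */
  return (int16_t)((prod + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}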
+
+void av1_iht8x16_128_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m128i in[16];
+
+ in[0] = load_input_data(input + 0 * 8);
+ in[1] = load_input_data(input + 1 * 8);
+ in[2] = load_input_data(input + 2 * 8);
+ in[3] = load_input_data(input + 3 * 8);
+ in[4] = load_input_data(input + 4 * 8);
+ in[5] = load_input_data(input + 5 * 8);
+ in[6] = load_input_data(input + 6 * 8);
+ in[7] = load_input_data(input + 7 * 8);
+
+ in[8] = load_input_data(input + 8 * 8);
+ in[9] = load_input_data(input + 9 * 8);
+ in[10] = load_input_data(input + 10 * 8);
+ in[11] = load_input_data(input + 11 * 8);
+ in[12] = load_input_data(input + 12 * 8);
+ in[13] = load_input_data(input + 13 * 8);
+ in[14] = load_input_data(input + 14 * 8);
+ in[15] = load_input_data(input + 15 * 8);
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ aom_idct8_sse2(in);
+ array_transpose_8x8(in, in);
+ aom_idct8_sse2(in + 8);
+ array_transpose_8x8(in + 8, in + 8);
+ break;
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST:
+#endif
+ aom_iadst8_sse2(in);
+ array_transpose_8x8(in, in);
+ aom_iadst8_sse2(in + 8);
+ array_transpose_8x8(in + 8, in + 8);
+ break;
+#if CONFIG_EXT_TX
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+ iidtx8_sse2(in);
+ iidtx8_sse2(in + 8);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ scale_sqrt2_8x8(in);
+ scale_sqrt2_8x8(in + 8);
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ idct16_8col(in);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ iadst16_8col(in);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX: iidtx16_8col(in); break;
+#endif
+ default: assert(0); break;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case H_DCT:
+#endif
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+#endif
+ write_buffer_8x16(dest, in, stride);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST: write_buffer_8x16(dest + stride * 15, in, -stride); break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ flip_buffer_lr_8x8(in);
+ flip_buffer_lr_8x8(in + 8);
+ write_buffer_8x16(dest, in, stride);
+ break;
+ case FLIPADST_FLIPADST:
+ flip_buffer_lr_8x8(in);
+ flip_buffer_lr_8x8(in + 8);
+ write_buffer_8x16(dest + stride * 15, in, -stride);
+ break;
+#endif
+ default: assert(0); break;
+ }
+}
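The extra Sqrt2 step between the row and column passes of the rectangular transforms is, by inference from the code structure rather than anything stated in the patch, a normalization fix-up: a WxH inverse built from two 1-D passes carries a gain of roughly sqrt(W * H), which is a power of two for square sizes but not for 8x16, so one more sqrt(2) factor brings the total back to something the final rounding shift can absorb.

#include <math.h>

/* Bookkeeping sketch: sqrt(16 * 16) = 16 (power of two), but
 * sqrt(8 * 16) = 8 * sqrt(2), hence the scale_sqrt2_8x8() calls above. */
static double two_d_gain(int w, int h) { return sqrt((double)w * h); }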
+
+static INLINE void write_buffer_8x8_round6(uint8_t *dest, __m128i *in,
+ int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+
+ RECON_AND_STORE(dest + 0 * stride, in[0]);
+ RECON_AND_STORE(dest + 1 * stride, in[1]);
+ RECON_AND_STORE(dest + 2 * stride, in[2]);
+ RECON_AND_STORE(dest + 3 * stride, in[3]);
+ RECON_AND_STORE(dest + 4 * stride, in[4]);
+ RECON_AND_STORE(dest + 5 * stride, in[5]);
+ RECON_AND_STORE(dest + 6 * stride, in[6]);
+ RECON_AND_STORE(dest + 7 * stride, in[7]);
+}
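write_buffer_8x8_round6() finishes the path used by av1_iht16x8_128_add_sse2() below: round the residual by 6 bits, then RECON_AND_STORE (from the inv_txfm_sse2 helpers included at the top of this file) adds it to the prediction already in dest and saturates to 8 bits. Row by row that is simply:

#include <stdint.h>

/* Scalar view of one row: round the residual, add to the prediction, clamp. */
static void recon_row_sketch(uint8_t *dest, const int16_t *residual, int n) {
  int i;
  for (i = 0; i < n; ++i) {
    int v = dest[i] + ((residual[i] + 32) >> 6);
    dest[i] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
  }
}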
+
+void av1_iht16x8_128_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m128i in[16];
+
+ // Transpose 16x8 input into in[]
+ in[0] = load_input_data(input + 0 * 16);
+ in[1] = load_input_data(input + 1 * 16);
+ in[2] = load_input_data(input + 2 * 16);
+ in[3] = load_input_data(input + 3 * 16);
+ in[4] = load_input_data(input + 4 * 16);
+ in[5] = load_input_data(input + 5 * 16);
+ in[6] = load_input_data(input + 6 * 16);
+ in[7] = load_input_data(input + 7 * 16);
+ array_transpose_8x8(in, in);
+
+ in[8] = load_input_data(input + 8 + 0 * 16);
+ in[9] = load_input_data(input + 8 + 1 * 16);
+ in[10] = load_input_data(input + 8 + 2 * 16);
+ in[11] = load_input_data(input + 8 + 3 * 16);
+ in[12] = load_input_data(input + 8 + 4 * 16);
+ in[13] = load_input_data(input + 8 + 5 * 16);
+ in[14] = load_input_data(input + 8 + 6 * 16);
+ in[15] = load_input_data(input + 8 + 7 * 16);
+ array_transpose_8x8(in + 8, in + 8);
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ idct16_8col(in);
+ break;
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST:
+#endif
+ iadst16_8col(in);
+ break;
+#if CONFIG_EXT_TX
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX: iidtx16_8col(in); break;
+#endif
+ default: assert(0); break;
+ }
+
+ // Scale
+ scale_sqrt2_8x8(in);
+ scale_sqrt2_8x8(in + 8);
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ aom_idct8_sse2(in);
+ aom_idct8_sse2(in + 8);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ aom_iadst8_sse2(in);
+ aom_iadst8_sse2(in + 8);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX:
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in + 8, in + 8);
+ iidtx8_sse2(in);
+ iidtx8_sse2(in + 8);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+#endif
+ write_buffer_8x8_round6(dest, in, stride);
+ write_buffer_8x8_round6(dest + 8, in + 8, stride);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST:
+ write_buffer_8x8_round6(dest + stride * 7, in, -stride);
+ write_buffer_8x8_round6(dest + stride * 7 + 8, in + 8, -stride);
+ break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ flip_buffer_lr_8x8(in);
+ flip_buffer_lr_8x8(in + 8);
+ write_buffer_8x8_round6(dest, in + 8, stride);
+ write_buffer_8x8_round6(dest + 8, in, stride);
+ break;
+ case FLIPADST_FLIPADST:
+ flip_buffer_lr_8x8(in);
+ flip_buffer_lr_8x8(in + 8);
+ write_buffer_8x8_round6(dest + stride * 7, in + 8, -stride);
+ write_buffer_8x8_round6(dest + stride * 7 + 8, in, -stride);
+ break;
+#endif
+ default: assert(0); break;
+ }
+}
+
+static INLINE void write_buffer_8x4_round5(uint8_t *dest, __m128i *in,
+ int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i zero = _mm_setzero_si128();
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 5);
+ in[1] = _mm_srai_epi16(in[1], 5);
+ in[2] = _mm_srai_epi16(in[2], 5);
+ in[3] = _mm_srai_epi16(in[3], 5);
+
+ RECON_AND_STORE(dest + 0 * stride, in[0]);
+ RECON_AND_STORE(dest + 1 * stride, in[1]);
+ RECON_AND_STORE(dest + 2 * stride, in[2]);
+ RECON_AND_STORE(dest + 3 * stride, in[3]);
+}
+
+void av1_iht8x4_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ __m128i in[8];
+
+ in[0] = load_input_data(input + 0 * 8);
+ in[1] = load_input_data(input + 1 * 8);
+ in[2] = load_input_data(input + 2 * 8);
+ in[3] = load_input_data(input + 3 * 8);
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ aom_idct8_sse2(in);
+ break;
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST: aom_iadst8_sse2(in); break;
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX: iidtx8_sse2(in); array_transpose_8x8(in, in);
+#endif
+ break;
+ default: assert(0); break;
+ }
+
+ scale_sqrt2_8x8(in);
+
+ // Repack data. We pack into the bottom half of 'in'
+ // so that the next repacking stage can pack into the
+ // top half without overwriting anything
+ in[7] = _mm_unpacklo_epi64(in[6], in[7]);
+ in[6] = _mm_unpacklo_epi64(in[4], in[5]);
+ in[5] = _mm_unpacklo_epi64(in[2], in[3]);
+ in[4] = _mm_unpacklo_epi64(in[0], in[1]);
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ aom_idct4_sse2(in + 4);
+ aom_idct4_sse2(in + 6);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ aom_iadst4_sse2(in + 4);
+ aom_iadst4_sse2(in + 6);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX:
+ iidtx4_sse2(in + 4);
+ array_transpose_4x4(in + 4);
+ iidtx4_sse2(in + 6);
+ array_transpose_4x4(in + 6);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ // Repack data
+ in[0] = _mm_unpacklo_epi64(in[4], in[6]);
+ in[1] = _mm_unpackhi_epi64(in[4], in[6]);
+ in[2] = _mm_unpacklo_epi64(in[5], in[7]);
+ in[3] = _mm_unpackhi_epi64(in[5], in[7]);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX: break;
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST: FLIPUD_PTR(dest, stride, 4); break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ break;
+ case FLIPADST_FLIPADST:
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ FLIPUD_PTR(dest, stride, 4);
+#endif
+ break;
+ default: assert(0); break;
+ }
+ write_buffer_8x4_round5(dest, in, stride);
+}
+
+static INLINE void write_buffer_4x8_round5(uint8_t *dest, __m128i *in,
+ int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i zero = _mm_setzero_si128();
+ // Final rounding and shift
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 5);
+ in[1] = _mm_srai_epi16(in[1], 5);
+ in[2] = _mm_srai_epi16(in[2], 5);
+ in[3] = _mm_srai_epi16(in[3], 5);
+
+ // Reconstruction and Store
+ {
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
+ __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
+ __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+ __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+ __m128i d4 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 4));
+ __m128i d5 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 5));
+ __m128i d6 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 6));
+ __m128i d7 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 7));
+
+ d0 = _mm_unpacklo_epi32(d0, d1);
+ d2 = _mm_unpacklo_epi32(d2, d3);
+ d4 = _mm_unpacklo_epi32(d4, d5);
+ d6 = _mm_unpacklo_epi32(d6, d7);
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d2 = _mm_unpacklo_epi8(d2, zero);
+ d4 = _mm_unpacklo_epi8(d4, zero);
+ d6 = _mm_unpacklo_epi8(d6, zero);
+ d0 = _mm_add_epi16(d0, in[0]);
+ d2 = _mm_add_epi16(d2, in[1]);
+ d4 = _mm_add_epi16(d4, in[2]);
+ d6 = _mm_add_epi16(d6, in[3]);
+
+ d0 = _mm_packus_epi16(d0, d2);
+ *(int *)dest = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_packus_epi16(d4, d6);
+ *(int *)(dest + stride * 4) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 5) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 6) = _mm_cvtsi128_si32(d0);
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 7) = _mm_cvtsi128_si32(d0);
+ }
+}
+
+void av1_iht4x8_32_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
+ int tx_type) {
+ __m128i in[8];
+
+ // Load rows, packed two per element of 'in'.
+ // We pack into the bottom half of 'in' so that the
+ // later repacking stage can pack into the
+ // top half without overwriting anything
+ in[4] = load_input_data(input + 0 * 8);
+ in[5] = load_input_data(input + 1 * 8);
+ in[6] = load_input_data(input + 2 * 8);
+ in[7] = load_input_data(input + 3 * 8);
+
+ scale_sqrt2_8x4(in + 4);
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ aom_idct4_sse2(in + 4);
+ aom_idct4_sse2(in + 6);
+ break;
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST:
+#endif
+ aom_iadst4_sse2(in + 4);
+ aom_iadst4_sse2(in + 6);
+ break;
+#if CONFIG_EXT_TX
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+ iidtx4_sse2(in + 4);
+ array_transpose_4x4(in + 4);
+ iidtx4_sse2(in + 6);
+ array_transpose_4x4(in + 6);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ // Repack data
+ in[0] = _mm_unpacklo_epi64(in[4], in[6]);
+ in[1] = _mm_unpackhi_epi64(in[4], in[6]);
+ in[2] = _mm_unpacklo_epi64(in[5], in[7]);
+ in[3] = _mm_unpackhi_epi64(in[5], in[7]);
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ aom_idct8_sse2(in);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ aom_iadst8_sse2(in);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX:
+ iidtx8_sse2(in);
+ array_transpose_8x8(in, in);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+#endif
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST: FLIPUD_PTR(dest, stride, 8); break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+ in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+ in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
+ in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
+ in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
+ in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
+ break;
+ case FLIPADST_FLIPADST:
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+ in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+ in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
+ in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
+ in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
+ in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
+ FLIPUD_PTR(dest, stride, 8);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ in[0] = _mm_unpacklo_epi64(in[0], in[1]);
+ in[1] = _mm_unpacklo_epi64(in[2], in[3]);
+ in[2] = _mm_unpacklo_epi64(in[4], in[5]);
+ in[3] = _mm_unpacklo_epi64(in[6], in[7]);
+ write_buffer_4x8_round5(dest, in, stride);
+}
+
+// Note: The 32-point transforms below operate on 16 columns at a time.  They
+// take input in the form of four 8x16 blocks (each stored as a __m128i[16]),
+// which are the four quadrants of the overall 16x32 input buffer.
+static INLINE void idct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+ __m128i *br) {
+ array_transpose_16x16(tl, tr);
+ array_transpose_16x16(bl, br);
+ idct32_8col(tl, bl);
+ idct32_8col(tr, br);
+}
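Concretely, the tl/tr/bl/br arguments are the four 8x16 corners of the coefficient block; the loads in av1_iht16x32_512_add_sse2() below follow this mapping. A sketch of that layout, assuming plain 16-bit coefficients (load_input_data() in the real code also handles the wider tran_low_t layout):

#include <emmintrin.h>
#include <stdint.h>

/* Quadrants of a 16-wide, 32-tall coefficient block:
 *   tl: rows  0..15, cols 0..7    tr: rows  0..15, cols 8..15
 *   bl: rows 16..31, cols 0..7    br: rows 16..31, cols 8..15 */
static void load_16x32_quadrants(const int16_t *coeff, __m128i tl[16],
                                 __m128i tr[16], __m128i bl[16],
                                 __m128i br[16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    tl[i] = _mm_loadu_si128((const __m128i *)(coeff + i * 16 + 0));
    tr[i] = _mm_loadu_si128((const __m128i *)(coeff + i * 16 + 8));
    bl[i] = _mm_loadu_si128((const __m128i *)(coeff + (i + 16) * 16 + 0));
    br[i] = _mm_loadu_si128((const __m128i *)(coeff + (i + 16) * 16 + 8));
  }
}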
+
+static INLINE void ihalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+ __m128i *br) {
+ __m128i tmpl[16], tmpr[16];
+ int i;
+
+ // Copy the top half of the input to temporary storage
+ for (i = 0; i < 16; ++i) {
+ tmpl[i] = tl[i];
+ tmpr[i] = tr[i];
+ }
+
+ // Generate the top half of the output
+ for (i = 0; i < 16; ++i) {
+ tl[i] = _mm_slli_epi16(bl[i], 2);
+ tr[i] = _mm_slli_epi16(br[i], 2);
+ }
+ array_transpose_16x16(tl, tr);
+
+ // Copy the temporary storage back to the bottom half of the input
+ for (i = 0; i < 16; ++i) {
+ bl[i] = tmpl[i];
+ br[i] = tmpr[i];
+ }
+
+ // Generate the bottom half of the output
+ scale_sqrt2_8x16(bl);
+ scale_sqrt2_8x16(br);
+ aom_idct16_sse2(bl, br); // Includes a transposition
+}
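Stripped of the transposes the SIMD layout needs, ihalfright32_16col() is the "half right" 32-point transform: the first 16 outputs are 4x the last 16 inputs, and the last 16 outputs are a 16-point IDCT of the first 16 inputs after the same Sqrt2 scaling used elsewhere in this file. As a scalar sketch (scale_sqrt2 and idct16_1d are placeholders for the corresponding 1-D routines):

#include <stdint.h>

static void ihalfright32_sketch(const int32_t *in, int32_t *out,
                                int32_t (*scale_sqrt2)(int32_t),
                                void (*idct16_1d)(const int32_t *, int32_t *)) {
  int32_t half[16];
  int i;
  for (i = 0; i < 16; ++i) out[i] = in[16 + i] * 4;      /* top half: x4      */
  for (i = 0; i < 16; ++i) half[i] = scale_sqrt2(in[i]); /* bottom: idct16 of */
  idct16_1d(half, out + 16);                             /* sqrt(2)-scaled in */
}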
+
+#if CONFIG_EXT_TX
+static INLINE void iidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+ __m128i *br) {
+ int i;
+ array_transpose_16x16(tl, tr);
+ array_transpose_16x16(bl, br);
+ for (i = 0; i < 16; ++i) {
+ tl[i] = _mm_slli_epi16(tl[i], 2);
+ tr[i] = _mm_slli_epi16(tr[i], 2);
+ bl[i] = _mm_slli_epi16(bl[i], 2);
+ br[i] = _mm_slli_epi16(br[i], 2);
+ }
+}
+#endif // CONFIG_EXT_TX
+
+static INLINE void write_buffer_16x32_round6(uint8_t *dest, __m128i *intl,
+ __m128i *intr, __m128i *inbl,
+ __m128i *inbr, int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ intl[i] = _mm_adds_epi16(intl[i], final_rounding);
+ intr[i] = _mm_adds_epi16(intr[i], final_rounding);
+ inbl[i] = _mm_adds_epi16(inbl[i], final_rounding);
+ inbr[i] = _mm_adds_epi16(inbr[i], final_rounding);
+ intl[i] = _mm_srai_epi16(intl[i], 6);
+ intr[i] = _mm_srai_epi16(intr[i], 6);
+ inbl[i] = _mm_srai_epi16(inbl[i], 6);
+ inbr[i] = _mm_srai_epi16(inbr[i], 6);
+ RECON_AND_STORE(dest + i * stride + 0, intl[i]);
+ RECON_AND_STORE(dest + i * stride + 8, intr[i]);
+ RECON_AND_STORE(dest + (i + 16) * stride + 0, inbl[i]);
+ RECON_AND_STORE(dest + (i + 16) * stride + 8, inbr[i]);
+ }
+}
+
+void av1_iht16x32_512_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m128i intl[16], intr[16], inbl[16], inbr[16];
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ intl[i] = load_input_data(input + i * 16 + 0);
+ intr[i] = load_input_data(input + i * 16 + 8);
+ inbl[i] = load_input_data(input + (i + 16) * 16 + 0);
+ inbr[i] = load_input_data(input + (i + 16) * 16 + 8);
+ }
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ aom_idct16_sse2(intl, intr);
+ aom_idct16_sse2(inbl, inbr);
+ break;
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST:
+#endif
+ aom_iadst16_sse2(intl, intr);
+ aom_iadst16_sse2(inbl, inbr);
+ break;
+#if CONFIG_EXT_TX
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+ iidtx16_sse2(intl, intr);
+ iidtx16_sse2(inbl, inbr);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ scale_sqrt2_8x16(intl);
+ scale_sqrt2_8x16(intr);
+ scale_sqrt2_8x16(inbl);
+ scale_sqrt2_8x16(inbr);
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ idct32_16col(intl, intr, inbl, inbr);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ ihalfright32_16col(intl, intr, inbl, inbr);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX: iidtx32_16col(intl, intr, inbl, inbr); break;
+#endif
+ default: assert(0); break;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+#endif
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST: FLIPUD_PTR(dest, stride, 32); break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ for (i = 0; i < 16; ++i) {
+ __m128i tmp = intl[i];
+ intl[i] = mm_reverse_epi16(intr[i]);
+ intr[i] = mm_reverse_epi16(tmp);
+ tmp = inbl[i];
+ inbl[i] = mm_reverse_epi16(inbr[i]);
+ inbr[i] = mm_reverse_epi16(tmp);
+ }
+ break;
+ case FLIPADST_FLIPADST:
+ for (i = 0; i < 16; ++i) {
+ __m128i tmp = intl[i];
+ intl[i] = mm_reverse_epi16(intr[i]);
+ intr[i] = mm_reverse_epi16(tmp);
+ tmp = inbl[i];
+ inbl[i] = mm_reverse_epi16(inbr[i]);
+ inbr[i] = mm_reverse_epi16(tmp);
+ }
+ FLIPUD_PTR(dest, stride, 32);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ write_buffer_16x32_round6(dest, intl, intr, inbl, inbr, stride);
+}
+
+static INLINE void write_buffer_32x16_round6(uint8_t *dest, __m128i *in0,
+ __m128i *in1, __m128i *in2,
+ __m128i *in3, int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ in0[i] = _mm_adds_epi16(in0[i], final_rounding);
+ in1[i] = _mm_adds_epi16(in1[i], final_rounding);
+ in2[i] = _mm_adds_epi16(in2[i], final_rounding);
+ in3[i] = _mm_adds_epi16(in3[i], final_rounding);
+ in0[i] = _mm_srai_epi16(in0[i], 6);
+ in1[i] = _mm_srai_epi16(in1[i], 6);
+ in2[i] = _mm_srai_epi16(in2[i], 6);
+ in3[i] = _mm_srai_epi16(in3[i], 6);
+ RECON_AND_STORE(dest + i * stride + 0, in0[i]);
+ RECON_AND_STORE(dest + i * stride + 8, in1[i]);
+ RECON_AND_STORE(dest + i * stride + 16, in2[i]);
+ RECON_AND_STORE(dest + i * stride + 24, in3[i]);
+ }
+}
+
+void av1_iht32x16_512_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride, int tx_type) {
+ __m128i in0[16], in1[16], in2[16], in3[16];
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ in0[i] = load_input_data(input + i * 32 + 0);
+ in1[i] = load_input_data(input + i * 32 + 8);
+ in2[i] = load_input_data(input + i * 32 + 16);
+ in3[i] = load_input_data(input + i * 32 + 24);
+ }
+
+ // Row transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case H_DCT:
+#endif
+ idct32_16col(in0, in1, in2, in3);
+ break;
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case H_ADST:
+ case H_FLIPADST:
+#endif
+ ihalfright32_16col(in0, in1, in2, in3);
+ break;
+#if CONFIG_EXT_TX
+ case V_FLIPADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX: iidtx32_16col(in0, in1, in2, in3); break;
+#endif
+ default: assert(0); break;
+ }
+
+ scale_sqrt2_8x16(in0);
+ scale_sqrt2_8x16(in1);
+ scale_sqrt2_8x16(in2);
+ scale_sqrt2_8x16(in3);
+
+ // Column transform
+ switch (tx_type) {
+ case DCT_DCT:
+ case DCT_ADST:
+#if CONFIG_EXT_TX
+ case DCT_FLIPADST:
+ case V_DCT:
+#endif
+ aom_idct16_sse2(in0, in1);
+ aom_idct16_sse2(in2, in3);
+ break;
+ case ADST_DCT:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case FLIPADST_ADST:
+ case ADST_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case FLIPADST_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+#endif
+ aom_iadst16_sse2(in0, in1);
+ aom_iadst16_sse2(in2, in3);
+ break;
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ case IDTX:
+ iidtx16_sse2(in0, in1);
+ iidtx16_sse2(in2, in3);
+ break;
+#endif
+ default: assert(0); break;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+#if CONFIG_EXT_TX
+ case H_DCT:
+ case H_ADST:
+ case V_ADST:
+ case V_DCT:
+ case IDTX:
+#endif
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case FLIPADST_ADST:
+ case V_FLIPADST: FLIPUD_PTR(dest, stride, 16); break;
+ case DCT_FLIPADST:
+ case ADST_FLIPADST:
+ case H_FLIPADST:
+ for (i = 0; i < 16; ++i) {
+ __m128i tmp1 = in0[i];
+ __m128i tmp2 = in1[i];
+ in0[i] = mm_reverse_epi16(in3[i]);
+ in1[i] = mm_reverse_epi16(in2[i]);
+ in2[i] = mm_reverse_epi16(tmp2);
+ in3[i] = mm_reverse_epi16(tmp1);
+ }
+ break;
+ case FLIPADST_FLIPADST:
+ for (i = 0; i < 16; ++i) {
+ __m128i tmp1 = in0[i];
+ __m128i tmp2 = in1[i];
+ in0[i] = mm_reverse_epi16(in3[i]);
+ in1[i] = mm_reverse_epi16(in2[i]);
+ in2[i] = mm_reverse_epi16(tmp2);
+ in3[i] = mm_reverse_epi16(tmp1);
+ }
+ FLIPUD_PTR(dest, stride, 16);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ write_buffer_32x16_round6(dest, in0, in1, in2, in3, stride);
+}
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 94f536d..fb240d7 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -26,9 +26,6 @@
#include "aom_ports/mem_ops.h"
#include "aom_scale/aom_scale.h"
#include "aom_util/aom_thread.h"
-#if CONFIG_BITSTREAM_DEBUG
-#include "aom_util/debug_util.h"
-#endif // CONFIG_BITSTREAM_DEBUG
#include "av1/common/alloccommon.h"
#if CONFIG_CLPF
@@ -81,7 +78,7 @@
static int is_compound_reference_allowed(const AV1_COMMON *cm) {
int i;
if (frame_is_intra_only(cm)) return 0;
- for (i = 1; i < REFS_PER_FRAME; ++i)
+ for (i = 1; i < INTER_REFS_PER_FRAME; ++i)
if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) return 1;
return 0;
@@ -93,6 +90,7 @@
cm->comp_fwd_ref[1] = LAST2_FRAME;
cm->comp_fwd_ref[2] = LAST3_FRAME;
cm->comp_fwd_ref[3] = GOLDEN_FRAME;
+
cm->comp_bwd_ref[0] = BWDREF_FRAME;
cm->comp_bwd_ref[1] = ALTREF_FRAME;
#else
@@ -127,20 +125,12 @@
return aom_rb_read_bit(rb) ? TX_MODE_SELECT : aom_rb_read_literal(rb, 2);
}
-static void read_tx_mode_probs(struct tx_probs *tx_probs, aom_reader *r) {
- int i, j;
-
- for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
- for (j = TX_4X4; j < TX_SIZES - 3; ++j)
- av1_diff_update_prob(r, &tx_probs->p8x8[i][j], ACCT_STR);
-
- for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
- for (j = TX_4X4; j < TX_SIZES - 2; ++j)
- av1_diff_update_prob(r, &tx_probs->p16x16[i][j], ACCT_STR);
-
- for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
- for (j = TX_4X4; j < TX_SIZES - 1; ++j)
- av1_diff_update_prob(r, &tx_probs->p32x32[i][j], ACCT_STR);
+static void read_tx_size_probs(FRAME_CONTEXT *fc, aom_reader *r) {
+ int i, j, k;
+ for (i = 0; i < MAX_TX_DEPTH; ++i)
+ for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+ for (k = 0; k < i + 1; ++k)
+ av1_diff_update_prob(r, &fc->tx_size_probs[i][j][k], ACCT_STR);
}
#if !CONFIG_EC_ADAPT
@@ -154,8 +144,8 @@
#endif
static void read_inter_mode_probs(FRAME_CONTEXT *fc, aom_reader *r) {
- int i;
#if CONFIG_REF_MV
+ int i;
for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
av1_diff_update_prob(r, &fc->newmv_prob[i], ACCT_STR);
for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
@@ -164,18 +154,37 @@
av1_diff_update_prob(r, &fc->refmv_prob[i], ACCT_STR);
for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
av1_diff_update_prob(r, &fc->drl_prob[i], ACCT_STR);
+#if CONFIG_EXT_INTER
+ av1_diff_update_prob(r, &fc->new2mv_prob, ACCT_STR);
+#endif // CONFIG_EXT_INTER
#else
- int j;
#if !CONFIG_EC_ADAPT
+ int i, j;
for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
for (j = 0; j < INTER_MODES - 1; ++j)
av1_diff_update_prob(r, &fc->inter_mode_probs[i][j], ACCT_STR);
}
+#else
+ (void)fc;
+ (void)r;
#endif
#endif
}
#if !CONFIG_EC_ADAPT
+#if CONFIG_EXT_INTER
+static void read_inter_compound_mode_probs(FRAME_CONTEXT *fc, aom_reader *r) {
+ int i, j;
+ if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
+ for (j = 0; j < INTER_MODE_CONTEXTS; ++j) {
+ for (i = 0; i < INTER_COMPOUND_MODES - 1; ++i) {
+ av1_diff_update_prob(r, &fc->inter_compound_mode_probs[j][i], ACCT_STR);
+ }
+ }
+ }
+}
+#endif // CONFIG_EXT_INTER
+#if !CONFIG_EXT_TX
static void read_ext_tx_probs(FRAME_CONTEXT *fc, aom_reader *r) {
int i, j, k;
if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
@@ -194,6 +203,7 @@
}
}
#endif
+#endif
static REFERENCE_MODE read_frame_reference_mode(
const AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
@@ -214,22 +224,27 @@
for (i = 0; i < COMP_INTER_CONTEXTS; ++i)
av1_diff_update_prob(r, &fc->comp_inter_prob[i], ACCT_STR);
- if (cm->reference_mode != COMPOUND_REFERENCE)
- for (i = 0; i < REF_CONTEXTS; ++i)
- for (j = 0; j < (SINGLE_REFS - 1); ++j)
- av1_diff_update_prob(r, &fc->single_ref_prob[i][j], ACCT_STR);
-
- if (cm->reference_mode != SINGLE_REFERENCE)
-#if CONFIG_EXT_REFS
+ if (cm->reference_mode != COMPOUND_REFERENCE) {
for (i = 0; i < REF_CONTEXTS; ++i) {
- for (j = 0; j < (FWD_REFS - 1); ++j)
- av1_diff_update_prob(r, &fc->comp_fwdref_prob[i][j], ACCT_STR);
- for (j = 0; j < (BWD_REFS - 1); ++j)
- av1_diff_update_prob(r, &fc->comp_bwdref_prob[i][j], ACCT_STR);
+ for (j = 0; j < (SINGLE_REFS - 1); ++j) {
+ av1_diff_update_prob(r, &fc->single_ref_prob[i][j], ACCT_STR);
+ }
}
+ }
+
+ if (cm->reference_mode != SINGLE_REFERENCE) {
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+#if CONFIG_EXT_REFS
+ for (j = 0; j < (FWD_REFS - 1); ++j)
+ av1_diff_update_prob(r, &fc->comp_ref_prob[i][j], ACCT_STR);
+ for (j = 0; j < (BWD_REFS - 1); ++j)
+ av1_diff_update_prob(r, &fc->comp_bwdref_prob[i][j], ACCT_STR);
#else
- for (i = 0; i < REF_CONTEXTS; ++i)
- av1_diff_update_prob(r, &fc->comp_ref_prob[i], ACCT_STR);
+ for (j = 0; j < (COMP_REFS - 1); ++j)
+ av1_diff_update_prob(r, &fc->comp_ref_prob[i][j], ACCT_STR);
#endif // CONFIG_EXT_REFS
+ }
+ }
}
static void update_mv_probs(aom_prob *p, int n, aom_reader *r) {
@@ -272,69 +287,26 @@
static void inverse_transform_block(MACROBLOCKD *xd, int plane,
const TX_TYPE tx_type,
const TX_SIZE tx_size, uint8_t *dst,
- int stride, int eob) {
+ int stride, int16_t scan_line, int eob) {
struct macroblockd_plane *const pd = &xd->plane[plane];
- const int seg_id = xd->mi[0]->mbmi.segment_id;
- if (eob > 0) {
- tran_low_t *const dqcoeff = pd->dqcoeff;
-#if CONFIG_AOM_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- switch (tx_size) {
- case TX_4X4:
- av1_highbd_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, xd->bd,
- tx_type, xd->lossless[seg_id]);
- break;
- case TX_8X8:
- av1_highbd_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, xd->bd,
- tx_type);
- break;
- case TX_16X16:
- av1_highbd_inv_txfm_add_16x16(dqcoeff, dst, stride, eob, xd->bd,
- tx_type);
- break;
- case TX_32X32:
- av1_highbd_inv_txfm_add_32x32(dqcoeff, dst, stride, eob, xd->bd,
- tx_type);
- break;
- default: assert(0 && "Invalid transform size"); return;
- }
- } else {
-#endif // CONFIG_AOM_HIGHBITDEPTH
- switch (tx_size) {
- case TX_4X4:
- av1_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, tx_type,
- xd->lossless[seg_id]);
- break;
- case TX_8X8:
- av1_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, tx_type);
- break;
- case TX_16X16:
- av1_inv_txfm_add_16x16(dqcoeff, dst, stride, eob, tx_type);
- break;
- case TX_32X32:
- av1_inv_txfm_add_32x32(dqcoeff, dst, stride, eob, tx_type);
- break;
- default: assert(0 && "Invalid transform size"); return;
- }
-#if CONFIG_AOM_HIGHBITDEPTH
- }
-#endif // CONFIG_AOM_HIGHBITDEPTH
+ tran_low_t *const dqcoeff = pd->dqcoeff;
+ INV_TXFM_PARAM inv_txfm_param;
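+ // Collect the transform parameters; the inverse-transform helpers pick the
+ // kernel from the size/type stored here.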
+ inv_txfm_param.tx_type = tx_type;
+ inv_txfm_param.tx_size = tx_size;
+ inv_txfm_param.eob = eob;
+ inv_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
- if (eob == 1) {
- dqcoeff[0] = 0;
- } else {
-#if CONFIG_ADAPT_SCAN
- memset(dqcoeff, 0, tx_size_2d[tx_size] * sizeof(dqcoeff[0]));
-#else // CONFIG_ADAPT_SCAN
- if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
- memset(dqcoeff, 0, 4 * tx_size_1d[tx_size] * sizeof(dqcoeff[0]));
- else if (tx_size == TX_32X32 && eob <= 34)
- memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
- else
- memset(dqcoeff, 0, tx_size_2d[tx_size] * sizeof(dqcoeff[0]));
-#endif // CONFIG_ADAPT_SCAN
- }
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ inv_txfm_param.bd = xd->bd;
+ highbd_inv_txfm_add(dqcoeff, dst, stride, &inv_txfm_param);
+ } else {
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ inv_txfm_add(dqcoeff, dst, stride, &inv_txfm_param);
+#if CONFIG_AOM_HIGHBITDEPTH
}
+#endif // CONFIG_AOM_HIGHBITDEPTH
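+ // Clear only the dequantized coefficients up to the last scan position used.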
+ memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0]));
}
#if CONFIG_PVQ
@@ -350,7 +322,7 @@
// DC quantizer for PVQ
int pvq_dc_quant;
int lossless = (quant[0] == 0);
- const int blk_size = tx_size_1d[bs];
+ const int blk_size = tx_size_wide[bs];
int eob = 0;
int i;
// TODO(yushin) : To enable activity masking,
@@ -411,7 +383,7 @@
int col, TX_SIZE tx_size, TX_TYPE tx_type) {
struct macroblockd_plane *const pd = &xd->plane[plane];
// transform block size in pixels
- int tx_blk_size = tx_size_1d[tx_size];
+ int tx_blk_size = tx_size_wide[tx_size];
int i, j;
tran_low_t *pvq_ref_coeff = pd->pvq_ref_coeff;
const int diff_stride = tx_blk_size;
@@ -438,6 +410,8 @@
int seg_id = mbmi->segment_id;
int16_t *quant;
FWD_TXFM_PARAM fwd_txfm_param;
+ // TODO(yaowu): correct this with optimal number from decoding process.
+ const int max_scan_line = tx_size_2d[tx_size];
for (j = 0; j < tx_blk_size; j++)
for (i = 0; i < tx_blk_size; i++) {
@@ -464,16 +438,23 @@
for (i = 0; i < tx_blk_size; i++) dst[j * pd->dst.stride + i] = 0;
inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
- eob);
+ max_scan_line, eob);
}
return eob;
}
#endif
-static void predict_and_reconstruct_intra_block(
- AV1_COMMON *cm, MACROBLOCKD *const xd, aom_reader *r,
- MB_MODE_INFO *const mbmi, int plane, int row, int col, TX_SIZE tx_size) {
+static void predict_and_reconstruct_intra_block(AV1_COMMON *cm,
+ MACROBLOCKD *const xd,
+#if CONFIG_ANS
+ struct AnsDecoder *const r,
+#else
+ aom_reader *r,
+#endif // CONFIG_ANS
+ MB_MODE_INFO *const mbmi,
+ int plane, int row, int col,
+ TX_SIZE tx_size) {
struct macroblockd_plane *const pd = &xd->plane[plane];
PREDICTION_MODE mode = (plane == 0) ? mbmi->mode : mbmi->uv_mode;
PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
@@ -488,55 +469,123 @@
if (mbmi->sb_type < BLOCK_8X8)
if (plane == 0) mode = xd->mi[0]->bmi[(row << 1) + col].as_mode;
- av1_predict_intra_block(xd, pd->n4_wl, pd->n4_hl, tx_size, mode, dst,
+ av1_predict_intra_block(xd, pd->width, pd->height, tx_size, mode, dst,
pd->dst.stride, dst, pd->dst.stride, col, row, plane);
if (!mbmi->skip) {
- TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, tx_size);
#if !CONFIG_PVQ
- const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type);
- const int eob = av1_decode_block_tokens(xd, plane, scan_order, col, row,
- tx_size, r, mbmi->segment_id);
+ const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 0);
+ int16_t max_scan_line = 0;
+ const int eob =
+ av1_decode_block_tokens(xd, plane, scan_order, col, row, tx_size,
+ tx_type, &max_scan_line, r, mbmi->segment_id);
#if CONFIG_ADAPT_SCAN
av1_update_scan_count_facade(cm, tx_size, tx_type, pd->dqcoeff, eob);
#endif
- inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
- eob);
+ if (eob)
+ inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
+ max_scan_line, eob);
#else
av1_pvq_decode_helper2(xd, mbmi, plane, row, col, tx_size, tx_type);
#endif
}
}
+#if CONFIG_VAR_TX
+static void decode_reconstruct_tx(AV1_COMMON *cm, MACROBLOCKD *const xd,
+ aom_reader *r, MB_MODE_INFO *const mbmi,
+ int plane, BLOCK_SIZE plane_bsize,
+ int blk_row, int blk_col, TX_SIZE tx_size,
+ int *eob_total) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ const int tx_row = blk_row >> (1 - pd->subsampling_y);
+ const int tx_col = blk_col >> (1 - pd->subsampling_x);
+ const TX_SIZE plane_tx_size =
+ plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
+ : mbmi->inter_tx_size[tx_row][tx_col];
+ // Scale to match transform block unit.
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (tx_size == plane_tx_size) {
+ PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+ int block_idx = (blk_row << 1) + blk_col;
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, plane_tx_size);
+ const SCAN_ORDER *sc = get_scan(cm, plane_tx_size, tx_type, 1);
+ int16_t max_scan_line = 0;
+ const int eob =
+ av1_decode_block_tokens(xd, plane, sc, blk_col, blk_row, plane_tx_size,
+ tx_type, &max_scan_line, r, mbmi->segment_id);
+ inverse_transform_block(
+ xd, plane, tx_type, plane_tx_size,
+ &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col],
+ pd->dst.stride, max_scan_line, eob);
+ *eob_total += eob;
+ } else {
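+ // Otherwise split into four quadrants and recurse at the next smaller tx size.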
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs];
+ int i;
+
+ assert(bsl > 0);
+
+ for (i = 0; i < 4; ++i) {
+ const int offsetr = blk_row + (i >> 1) * bsl;
+ const int offsetc = blk_col + (i & 0x01) * bsl;
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, offsetr,
+ offsetc, sub_txs, eob_total);
+ }
+ }
+}
+#endif // CONFIG_VAR_TX
+
+#if !CONFIG_VAR_TX || CONFIG_SUPERTX || (CONFIG_EXT_TX && CONFIG_RECT_TX)
static int reconstruct_inter_block(AV1_COMMON *cm, MACROBLOCKD *const xd,
- aom_reader *r, MB_MODE_INFO *const mbmi,
- int plane, int row, int col,
+#if CONFIG_ANS
+ struct AnsDecoder *const r,
+#else
+ aom_reader *r,
+#endif
+ int segment_id, int plane, int row, int col,
TX_SIZE tx_size) {
- struct macroblockd_plane *const pd = &xd->plane[plane];
PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
int block_idx = (row << 1) + col;
- TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, tx_size);
#if CONFIG_PVQ
int eob;
(void)cm;
(void)r;
+ (void)segment_id;
+#else
+ struct macroblockd_plane *const pd = &xd->plane[plane];
#endif
#if !CONFIG_PVQ
- const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type);
- const int eob = av1_decode_block_tokens(xd, plane, scan_order, col, row,
- tx_size, r, mbmi->segment_id);
+ const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 1);
+ int16_t max_scan_line = 0;
+ const int eob =
+ av1_decode_block_tokens(xd, plane, scan_order, col, row, tx_size, tx_type,
+ &max_scan_line, r, segment_id);
#if CONFIG_ADAPT_SCAN
av1_update_scan_count_facade(cm, tx_size, tx_type, pd->dqcoeff, eob);
#endif
- inverse_transform_block(xd, plane, tx_type, tx_size,
- &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
- pd->dst.stride, eob);
+ if (eob)
+ inverse_transform_block(xd, plane, tx_type, tx_size,
+ &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
+ pd->dst.stride, max_scan_line, eob);
#else
- eob = av1_pvq_decode_helper2(xd, mbmi, plane, row, col, tx_size, tx_type);
+ eob = av1_pvq_decode_helper2(xd, &xd->mi[0]->mbmi, plane, row, col, tx_size,
+ tx_type);
#endif
return eob;
}
+#endif // !CONFIG_VAR_TX || CONFIG_SUPERTX || (CONFIG_EXT_TX && CONFIG_RECT_TX)
static INLINE void dec_reset_skip_context(MACROBLOCKD *xd) {
int i;
@@ -560,12 +609,20 @@
// TODO(slavarnway): Generate sb_type based on bwl and bhl, instead of
// passing bsize from decode_partition().
xd->mi[0]->mbmi.sb_type = bsize;
+#if CONFIG_RD_DEBUG
+ xd->mi[0]->mbmi.mi_row = mi_row;
+ xd->mi[0]->mbmi.mi_col = mi_col;
+#endif
for (y = 0; y < y_mis; ++y)
for (x = !y; x < x_mis; ++x) xd->mi[y * cm->mi_stride + x] = xd->mi[0];
set_plane_n4(xd, bw, bh, bwl, bhl);
set_skip_context(xd, mi_row, mi_col);
+#if CONFIG_VAR_TX
+ xd->max_tx_size = max_txsize_lookup[bsize];
+#endif
+
// Distance of Mb to the various image edges. These are specified to 8th pel
// as they are always compared to values that are in 1/8th pel units
set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
@@ -574,22 +631,762 @@
return &xd->mi[0]->mbmi;
}
+#if CONFIG_SUPERTX
+static MB_MODE_INFO *set_offsets_extend(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ const TileInfo *const tile,
+ BLOCK_SIZE bsize_pred, int mi_row_pred,
+ int mi_col_pred, int mi_row_ori,
+ int mi_col_ori) {
+ // Used in supertx
+ // (mi_row_ori, mi_col_ori): location for mv
+ // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+ const int bw = num_8x8_blocks_wide_lookup[bsize_pred];
+ const int bh = num_8x8_blocks_high_lookup[bsize_pred];
+ const int offset = mi_row_ori * cm->mi_stride + mi_col_ori;
+ const int bwl = b_width_log2_lookup[bsize_pred];
+ const int bhl = b_height_log2_lookup[bsize_pred];
+ xd->mi = cm->mi_grid_visible + offset;
+ xd->mi[0] = cm->mi + offset;
+ set_mi_row_col(xd, tile, mi_row_pred, bh, mi_col_pred, bw, cm->mi_rows,
+ cm->mi_cols);
+
+ xd->up_available = (mi_row_ori > tile->mi_row_start);
+ xd->left_available = (mi_col_ori > tile->mi_col_start);
+
+ set_plane_n4(xd, bw, bh, bwl, bhl);
+
+ return &xd->mi[0]->mbmi;
+}
+
+static MB_MODE_INFO *set_mb_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int bw, int bh, int x_mis, int y_mis) {
+ const int offset = mi_row * cm->mi_stride + mi_col;
+ const TileInfo *const tile = &xd->tile;
+ int x, y;
+
+ xd->mi = cm->mi_grid_visible + offset;
+ xd->mi[0] = cm->mi + offset;
+ xd->mi[0]->mbmi.sb_type = bsize;
+ for (y = 0; y < y_mis; ++y)
+ for (x = !y; x < x_mis; ++x) xd->mi[y * cm->mi_stride + x] = xd->mi[0];
+
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+ return &xd->mi[0]->mbmi;
+}
+
+static void set_offsets_topblock(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ const TileInfo *const tile, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
+ const int offset = mi_row * cm->mi_stride + mi_col;
+ const int bwl = b_width_log2_lookup[bsize];
+ const int bhl = b_height_log2_lookup[bsize];
+
+ xd->mi = cm->mi_grid_visible + offset;
+ xd->mi[0] = cm->mi + offset;
+
+ set_plane_n4(xd, bw, bh, bwl, bhl);
+
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+ av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
+}
+
+static void set_param_topblock(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int txfm, int skip) {
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
+ const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
+ const int offset = mi_row * cm->mi_stride + mi_col;
+ int x, y;
+
+ xd->mi = cm->mi_grid_visible + offset;
+ xd->mi[0] = cm->mi + offset;
+
+ for (y = 0; y < y_mis; ++y)
+ for (x = 0; x < x_mis; ++x) {
+ xd->mi[y * cm->mi_stride + x]->mbmi.skip = skip;
+ xd->mi[y * cm->mi_stride + x]->mbmi.tx_type = txfm;
+ }
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ set_txfm_ctxs(xd->mi[0]->mbmi.tx_size, bw, bh, xd);
+#endif
+}
+
+static void set_ref(AV1_COMMON *const cm, MACROBLOCKD *const xd, int idx,
+ int mi_row, int mi_col) {
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ RefBuffer *ref_buffer = &cm->frame_refs[mbmi->ref_frame[idx] - LAST_FRAME];
+ xd->block_refs[idx] = ref_buffer;
+ if (!av1_is_valid_scale(&ref_buffer->sf))
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Invalid scale factors");
+ av1_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col,
+ &ref_buffer->sf);
+ xd->corrupted |= ref_buffer->buf->corrupted;
+}
+
+static void dec_predict_b_extend(
+ AV1Decoder *const pbi, MACROBLOCKD *const xd, const TileInfo *const tile,
+ int block, int mi_row_ori, int mi_col_ori, int mi_row_pred, int mi_col_pred,
+ int mi_row_top, int mi_col_top, uint8_t *dst_buf[3], int dst_stride[3],
+ BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred, int b_sub8x8, int bextend) {
+ // Used in supertx
+ // (mi_row_ori, mi_col_ori): location for mv
+ // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+ // (mi_row_top, mi_col_top, bsize_top): region of the top partition size
+ // block: sub location of sub8x8 blocks
+ // b_sub8x8: 1: ori is sub8x8; 0: ori is not sub8x8
+ // bextend: 1: region to predict is an extension of ori; 0: not
+ int r = (mi_row_pred - mi_row_top) * MI_SIZE;
+ int c = (mi_col_pred - mi_col_top) * MI_SIZE;
+ const int mi_width_top = num_8x8_blocks_wide_lookup[bsize_top];
+ const int mi_height_top = num_8x8_blocks_high_lookup[bsize_top];
+ MB_MODE_INFO *mbmi;
+ AV1_COMMON *const cm = &pbi->common;
+
+ if (mi_row_pred < mi_row_top || mi_col_pred < mi_col_top ||
+ mi_row_pred >= mi_row_top + mi_height_top ||
+ mi_col_pred >= mi_col_top + mi_width_top || mi_row_pred >= cm->mi_rows ||
+ mi_col_pred >= cm->mi_cols)
+ return;
+
+ mbmi = set_offsets_extend(cm, xd, tile, bsize_pred, mi_row_pred, mi_col_pred,
+ mi_row_ori, mi_col_ori);
+ set_ref(cm, xd, 0, mi_row_pred, mi_col_pred);
+ if (has_second_ref(&xd->mi[0]->mbmi))
+ set_ref(cm, xd, 1, mi_row_pred, mi_col_pred);
+
+ if (!bextend) {
+ mbmi->tx_size = b_width_log2_lookup[bsize_top];
+ }
+
+ xd->plane[0].dst.stride = dst_stride[0];
+ xd->plane[1].dst.stride = dst_stride[1];
+ xd->plane[2].dst.stride = dst_stride[2];
+ xd->plane[0].dst.buf = dst_buf[0] +
+ (r >> xd->plane[0].subsampling_y) * dst_stride[0] +
+ (c >> xd->plane[0].subsampling_x);
+ xd->plane[1].dst.buf = dst_buf[1] +
+ (r >> xd->plane[1].subsampling_y) * dst_stride[1] +
+ (c >> xd->plane[1].subsampling_x);
+ xd->plane[2].dst.buf = dst_buf[2] +
+ (r >> xd->plane[2].subsampling_y) * dst_stride[2] +
+ (c >> xd->plane[2].subsampling_x);
+
+ if (!b_sub8x8)
+ av1_build_inter_predictors_sb_extend(xd,
+#if CONFIG_EXT_INTER
+ mi_row_ori, mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ mi_row_pred, mi_col_pred, bsize_pred);
+ else
+ av1_build_inter_predictors_sb_sub8x8_extend(xd,
+#if CONFIG_EXT_INTER
+ mi_row_ori, mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ mi_row_pred, mi_col_pred,
+ bsize_pred, block);
+}
+
+static void dec_extend_dir(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+ const TileInfo *const tile, int block,
+ BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, int mi_row,
+ int mi_col, int mi_row_top, int mi_col_top,
+ uint8_t *dst_buf[3], int dst_stride[3], int dir) {
+ // dir: 0-lower, 1-upper, 2-left, 3-right
+ // 4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ int xss = xd->plane[1].subsampling_x;
+ int yss = xd->plane[1].subsampling_y;
+ int b_sub8x8 = (bsize < BLOCK_8X8) ? 1 : 0;
+ BLOCK_SIZE extend_bsize;
+ int unit, mi_row_pred, mi_col_pred;
+
+ if (dir == 0 || dir == 1) {
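+ // Extend vertically: predict the mode-info row just below (dir 0) or just
+ // above (dir 1) the block.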
+ extend_bsize = (mi_width == 1 || bsize < BLOCK_8X8 || xss < yss)
+ ? BLOCK_8X8
+ : BLOCK_16X8;
+ unit = num_8x8_blocks_wide_lookup[extend_bsize];
+ mi_row_pred = mi_row + ((dir == 0) ? mi_height : -1);
+ mi_col_pred = mi_col;
+
+ dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col, mi_row_pred,
+ mi_col_pred, mi_row_top, mi_col_top, dst_buf,
+ dst_stride, top_bsize, extend_bsize, b_sub8x8, 1);
+
+ if (mi_width > unit) {
+ int i;
+ assert(!b_sub8x8);
+ for (i = 0; i < mi_width / unit - 1; i++) {
+ mi_col_pred += unit;
+ dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col, mi_row_pred,
+ mi_col_pred, mi_row_top, mi_col_top, dst_buf,
+ dst_stride, top_bsize, extend_bsize, b_sub8x8, 1);
+ }
+ }
+ } else if (dir == 2 || dir == 3) {
+ extend_bsize = (mi_height == 1 || bsize < BLOCK_8X8 || yss < xss)
+ ? BLOCK_8X8
+ : BLOCK_8X16;
+ unit = num_8x8_blocks_high_lookup[extend_bsize];
+ mi_row_pred = mi_row;
+ mi_col_pred = mi_col + ((dir == 3) ? mi_width : -1);
+
+ dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col, mi_row_pred,
+ mi_col_pred, mi_row_top, mi_col_top, dst_buf,
+ dst_stride, top_bsize, extend_bsize, b_sub8x8, 1);
+
+ if (mi_height > unit) {
+ int i;
+ for (i = 0; i < mi_height / unit - 1; i++) {
+ mi_row_pred += unit;
+ dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col, mi_row_pred,
+ mi_col_pred, mi_row_top, mi_col_top, dst_buf,
+ dst_stride, top_bsize, extend_bsize, b_sub8x8, 1);
+ }
+ }
+ } else {
+ extend_bsize = BLOCK_8X8;
+ mi_row_pred = mi_row + ((dir == 4 || dir == 6) ? mi_height : -1);
+ mi_col_pred = mi_col + ((dir == 6 || dir == 7) ? mi_width : -1);
+ dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col, mi_row_pred,
+ mi_col_pred, mi_row_top, mi_col_top, dst_buf,
+ dst_stride, top_bsize, extend_bsize, b_sub8x8, 1);
+ }
+}
+
+static void dec_extend_all(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+ const TileInfo *const tile, int block,
+ BLOCK_SIZE bsize, BLOCK_SIZE top_bsize, int mi_row,
+ int mi_col, int mi_row_top, int mi_col_top,
+ uint8_t *dst_buf[3], int dst_stride[3]) {
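+ // Extend the prediction into all four sides and all four corners of the block.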
+ dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, 0);
+ dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, 1);
+ dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, 2);
+ dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, 3);
+ dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, 4);
+ dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, 5);
+ dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, 6);
+ dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, 7);
+}
+
+static void dec_predict_sb_complex(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+ const TileInfo *const tile, int mi_row,
+ int mi_col, int mi_row_top, int mi_col_top,
+ BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+ uint8_t *dst_buf[3], int dst_stride[3]) {
+ const AV1_COMMON *const cm = &pbi->common;
+ const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+ const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+ const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+ int i;
+ const int mi_offset = mi_row * cm->mi_stride + mi_col;
+ uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
+
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf3[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+ int dst_stride1[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
+ int dst_stride2[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
+ int dst_stride3[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
+
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int len = sizeof(uint16_t);
+ dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+ dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_TX_SQUARE * len);
+ dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_TX_SQUARE * len);
+ dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+ dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_TX_SQUARE * len);
+ dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_TX_SQUARE * len);
+ dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3);
+ dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAX_TX_SQUARE * len);
+ dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAX_TX_SQUARE * len);
+ } else {
+#endif
+ dst_buf1[0] = tmp_buf1;
+ dst_buf1[1] = tmp_buf1 + MAX_TX_SQUARE;
+ dst_buf1[2] = tmp_buf1 + 2 * MAX_TX_SQUARE;
+ dst_buf2[0] = tmp_buf2;
+ dst_buf2[1] = tmp_buf2 + MAX_TX_SQUARE;
+ dst_buf2[2] = tmp_buf2 + 2 * MAX_TX_SQUARE;
+ dst_buf3[0] = tmp_buf3;
+ dst_buf3[1] = tmp_buf3 + MAX_TX_SQUARE;
+ dst_buf3[2] = tmp_buf3 + 2 * MAX_TX_SQUARE;
+#if CONFIG_AOM_HIGHBITDEPTH
+ }
+#endif
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ xd->mi = cm->mi_grid_visible + mi_offset;
+ xd->mi[0] = cm->mi + mi_offset;
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ }
+
+ switch (partition) {
+ case PARTITION_NONE:
+ assert(bsize < top_bsize);
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, bsize, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+ break;
+ case PARTITION_HORZ:
+ if (bsize == BLOCK_8X8) {
+ // For sub8x8, predict in 8x8 unit
+ // First half
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, BLOCK_8X8, 1, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+ // Second half
+ dec_predict_b_extend(pbi, xd, tile, 2, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, BLOCK_8X8, 1, 1);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 2, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+ // weighted average to smooth the boundary
+ xd->plane[0].dst.buf = dst_buf[0];
+ xd->plane[0].dst.stride = dst_stride[0];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ 0);
+ } else {
+ // First half
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, subsize, 0, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+ else
+ dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, 0);
+
+ if (mi_row + hbs < cm->mi_rows) {
+ // Second half
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col,
+ mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+ dst_buf1, dst_stride1, top_bsize, subsize, 0, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1);
+ else
+ dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, 1);
+
+ // weighted average to smooth the boundary
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ }
+ }
+ break;
+ case PARTITION_VERT:
+ if (bsize == BLOCK_8X8) {
+ // First half
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, BLOCK_8X8, 1, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+ // Second half
+ dec_predict_b_extend(pbi, xd, tile, 1, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, BLOCK_8X8, 1, 1);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 1, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+ // Smooth
+ xd->plane[0].dst.buf = dst_buf[0];
+ xd->plane[0].dst.stride = dst_stride[0];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ 0);
+ } else {
+ // First half
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, subsize, 0, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+ else
+ dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, 3);
+
+ // Second half
+ if (mi_col + hbs < cm->mi_cols) {
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, subsize, 0, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1);
+ else
+ dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, 2);
+
+ // Smooth
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ }
+ }
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8) {
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, BLOCK_8X8, 1, 0);
+ dec_predict_b_extend(pbi, xd, tile, 1, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, BLOCK_8X8, 1, 1);
+ dec_predict_b_extend(pbi, xd, tile, 2, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+ top_bsize, BLOCK_8X8, 1, 1);
+ dec_predict_b_extend(pbi, xd, tile, 3, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf3, dst_stride3,
+ top_bsize, BLOCK_8X8, 1, 1);
+ if (bsize < top_bsize) {
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+ dec_extend_all(pbi, xd, tile, 1, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+ dec_extend_all(pbi, xd, tile, 2, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf2, dst_stride2);
+ dec_extend_all(pbi, xd, tile, 3, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf3, dst_stride3);
+ }
+ } else {
+ dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col, mi_row_top,
+ mi_col_top, subsize, top_bsize, dst_buf,
+ dst_stride);
+ if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+ dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, subsize, top_bsize,
+ dst_buf1, dst_stride1);
+ if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
+ dec_predict_sb_complex(pbi, xd, tile, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, subsize, top_bsize,
+ dst_buf2, dst_stride2);
+ if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+ dec_predict_sb_complex(pbi, xd, tile, mi_row + hbs, mi_col + hbs,
+ mi_row_top, mi_col_top, subsize, top_bsize,
+ dst_buf3, dst_stride3);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ if (bsize == BLOCK_8X8 && i != 0)
+ continue; // Skip <4x4 chroma smoothing
+ if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ if (mi_row + hbs < cm->mi_rows) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf2[i], dst_stride2[i], dst_buf3[i], dst_stride3[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ }
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, bsize2, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, bsize2, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf2,
+ dst_stride2, top_bsize, subsize, 0, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2);
+ else
+ dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+ 1);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ i);
+ }
+ break;
+ case PARTITION_VERT_A:
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, bsize2, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, bsize2, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
+ dst_stride2, top_bsize, subsize, 0, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
+ dst_stride2);
+ else
+ dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
+ dst_stride2, 2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ i);
+ }
+ break;
+ case PARTITION_HORZ_B:
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, subsize, 0, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+ else
+ dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, 0);
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, bsize2, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col + hbs,
+ mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+ dst_buf2, dst_stride2, top_bsize, bsize2, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
+ dst_stride2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf1[i];
+ xd->plane[i].dst.stride = dst_stride1[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ i);
+ }
+ break;
+ case PARTITION_VERT_B:
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, subsize, 0, 0);
+ if (bsize < top_bsize)
+ dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride);
+ else
+ dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, 3);
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, bsize2, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+ dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col + hbs,
+ mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+ dst_buf2, dst_stride2, top_bsize, bsize2, 0, 0);
+ dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
+ dst_stride2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf1[i];
+ xd->plane[i].dst.stride = dst_stride1[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ i);
+ }
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0);
+ }
+}
+
+static void set_segment_id_supertx(const AV1_COMMON *const cm, const int mi_row,
+ const int mi_col, const BLOCK_SIZE bsize) {
+ const struct segmentation *seg = &cm->seg;
+ const int miw =
+ AOMMIN(num_8x8_blocks_wide_lookup[bsize], cm->mi_cols - mi_col);
+ const int mih =
+ AOMMIN(num_8x8_blocks_high_lookup[bsize], cm->mi_rows - mi_row);
+ const int mi_offset = mi_row * cm->mi_stride + mi_col;
+ MODE_INFO **const mip = cm->mi_grid_visible + mi_offset;
+ int r, c;
+ int seg_id_supertx = MAX_SEGMENTS;
+
+ if (!seg->enabled) {
+ seg_id_supertx = 0;
+ } else {
+ // Find the minimum segment_id
+ for (r = 0; r < mih; r++)
+ for (c = 0; c < miw; c++)
+ seg_id_supertx =
+ AOMMIN(mip[r * cm->mi_stride + c]->mbmi.segment_id, seg_id_supertx);
+ assert(0 <= seg_id_supertx && seg_id_supertx < MAX_SEGMENTS);
+ }
+
+ // Assign the segment_id back to segment_id_supertx
+ for (r = 0; r < mih; r++)
+ for (c = 0; c < miw; c++)
+ mip[r * cm->mi_stride + c]->mbmi.segment_id_supertx = seg_id_supertx;
+}
+#endif // CONFIG_SUPERTX
+
static void decode_block(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif // CONFIG_SUPERTX
int mi_row, int mi_col, aom_reader *r,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition,
+#endif // CONFIG_EXT_PARTITION_TYPES
BLOCK_SIZE bsize, int bwl, int bhl) {
AV1_COMMON *const cm = &pbi->common;
- const int less8x8 = bsize < BLOCK_8X8;
const int bw = 1 << (bwl - 1);
const int bh = 1 << (bhl - 1);
const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
-
- MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis,
- y_mis, bwl, bhl);
+ MB_MODE_INFO *mbmi;
#if CONFIG_ACCOUNTING
aom_accounting_set_context(&pbi->accounting, mi_col, mi_row);
#endif
+#if CONFIG_SUPERTX
+ if (supertx_enabled) {
+ mbmi = set_mb_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis);
+ } else {
+ mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis, bwl,
+ bhl);
+ }
+#if CONFIG_EXT_PARTITION_TYPES
+ xd->mi[0]->mbmi.partition = partition;
+#endif
+ av1_read_mode_info(pbi, xd, supertx_enabled, mi_row, mi_col, r, x_mis, y_mis);
+#else
+ mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, y_mis, bwl,
+ bhl);
+#if CONFIG_EXT_PARTITION_TYPES
+ xd->mi[0]->mbmi.partition = partition;
+#endif
+ av1_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
+#endif // CONFIG_SUPERTX
if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
const BLOCK_SIZE uv_subsize =
@@ -599,7 +1396,14 @@
"Invalid block size.");
}
- av1_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
+#if CONFIG_SUPERTX
+ mbmi->segment_id_supertx = MAX_SEGMENTS;
+
+ if (supertx_enabled) {
+ xd->corrupted |= aom_reader_has_error(r);
+ return;
+ }
+#endif // CONFIG_SUPERTX
#if CONFIG_DELTA_Q
if (cm->delta_q_present_flag) {
@@ -624,7 +1428,6 @@
if (mbmi->skip) {
dec_reset_skip_context(xd);
}
-
if (!is_inter_block(mbmi)) {
int plane;
#if CONFIG_PALETTE
@@ -638,7 +1441,8 @@
const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
const int num_4x4_w = pd->n4_w;
const int num_4x4_h = pd->n4_h;
- const int step = tx_size_1d_in_unit[tx_size];
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
int row, col;
const int max_blocks_wide =
num_4x4_w + (xd->mb_to_right_edge >= 0
@@ -649,8 +1453,8 @@
? 0
: xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
- for (row = 0; row < max_blocks_high; row += step)
- for (col = 0; col < max_blocks_wide; col += step)
+ for (row = 0; row < max_blocks_high; row += stepr)
+ for (col = 0; col < max_blocks_wide; col += stepc)
predict_and_reconstruct_intra_block(cm, xd, r, mbmi, plane, row, col,
tx_size);
}
@@ -658,9 +1462,11 @@
// Prediction
av1_build_inter_predictors_sb(xd, mi_row, mi_col, AOMMAX(bsize, BLOCK_8X8));
#if CONFIG_MOTION_VAR
- if (mbmi->motion_mode == OBMC_CAUSAL)
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+ }
#endif // CONFIG_MOTION_VAR
+
// Reconstruction
if (!mbmi->skip) {
int eobtotal = 0;
@@ -668,29 +1474,68 @@
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
const struct macroblockd_plane *const pd = &xd->plane[plane];
+ int block_width = pd->width;
+ int block_height = pd->height;
+ int row, col;
+#if CONFIG_VAR_TX
+ // TODO(jingning): This can be simplified for decoder performance.
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd);
+ const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+ const int bh_var_tx = tx_size_high_unit[max_tx_size];
+ const int bw_var_tx = tx_size_wide_unit[max_tx_size];
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (is_rect_tx(mbmi->tx_size)) {
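+ // A rectangular transform covers the block with a single tx size, so there
+ // is no recursive transform split here.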
+ const TX_SIZE tx_size =
+ plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+ int max_blocks_wide =
+ block_width +
+ (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >>
+ (3 + pd->subsampling_x));
+ int max_blocks_high =
+ block_height +
+ (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >>
+ (3 + pd->subsampling_y));
+ max_blocks_wide >>= tx_size_wide_log2[0];
+ max_blocks_high >>= tx_size_wide_log2[0];
+ for (row = 0; row < max_blocks_high; row += stepr)
+ for (col = 0; col < max_blocks_wide; col += stepc)
+ eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id,
+ plane, row, col, tx_size);
+ } else {
+#endif
+ block_width >>= tx_size_wide_log2[0];
+ block_height >>= tx_size_wide_log2[0];
+ for (row = 0; row < block_height; row += bh_var_tx)
+ for (col = 0; col < block_width; col += bw_var_tx)
+ decode_reconstruct_tx(cm, xd, r, mbmi, plane, plane_bsize, row,
+ col, max_tx_size, &eobtotal);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ }
+#endif
+#else
const TX_SIZE tx_size =
plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
- const int num_4x4_w = pd->n4_w;
- const int num_4x4_h = pd->n4_h;
- const int step = tx_size_1d_in_unit[tx_size];
- int row, col;
- const int max_blocks_wide =
- num_4x4_w + (xd->mb_to_right_edge >= 0
- ? 0
- : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
- const int max_blocks_high =
- num_4x4_h +
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+ int max_blocks_wide =
+ block_width +
+ (xd->mb_to_right_edge >= 0 ? 0 : xd->mb_to_right_edge >>
+ (3 + pd->subsampling_x));
+ int max_blocks_high =
+ block_height +
(xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >>
- (5 + pd->subsampling_y));
-
- for (row = 0; row < max_blocks_high; row += step)
- for (col = 0; col < max_blocks_wide; col += step)
- eobtotal += reconstruct_inter_block(cm, xd, r, mbmi, plane, row,
- col, tx_size);
+ (3 + pd->subsampling_y));
+ max_blocks_wide >>= tx_size_wide_log2[0];
+ max_blocks_high >>= tx_size_wide_log2[0];
+ for (row = 0; row < max_blocks_high; row += stepr)
+ for (col = 0; col < max_blocks_wide; col += stepc)
+ eobtotal += reconstruct_inter_block(cm, xd, r, mbmi->segment_id,
+ plane, row, col, tx_size);
+#endif
}
-
- if (!less8x8 && eobtotal == 0)
- mbmi->has_no_coeffs = 1; // skip loopfilter
}
}
@@ -709,6 +1554,7 @@
return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
}
+#if !CONFIG_EXT_PARTITION_TYPES
static INLINE void dec_update_partition_context(MACROBLOCKD *xd, int mi_row,
int mi_col, BLOCK_SIZE subsize,
int bw) {
@@ -722,22 +1568,35 @@
memset(above_ctx, partition_context_lookup[subsize].above, bw);
memset(left_ctx, partition_context_lookup[subsize].left, bw);
}
+#endif // !CONFIG_EXT_PARTITION_TYPES
static PARTITION_TYPE read_partition(AV1_COMMON *cm, MACROBLOCKD *xd,
int mi_row, int mi_col, aom_reader *r,
- int has_rows, int has_cols, int bsl) {
+ int has_rows, int has_cols,
+#if CONFIG_EXT_PARTITION_TYPES
+ BLOCK_SIZE bsize,
+#endif
+ int bsl) {
const int ctx = dec_partition_plane_context(xd, mi_row, mi_col, bsl);
const aom_prob *const probs = cm->fc->partition_prob[ctx];
FRAME_COUNTS *counts = xd->counts;
PARTITION_TYPE p;
if (has_rows && has_cols)
+#if CONFIG_EXT_PARTITION_TYPES
+ if (bsize <= BLOCK_8X8)
+ p = (PARTITION_TYPE)aom_read_tree(r, av1_partition_tree, probs, ACCT_STR);
+ else
+ p = (PARTITION_TYPE)aom_read_tree(r, av1_ext_partition_tree, probs,
+ ACCT_STR);
+#else
#if CONFIG_DAALA_EC
p = (PARTITION_TYPE)aom_read_symbol(r, cm->fc->partition_cdf[ctx],
PARTITION_TYPES, ACCT_STR);
#else
p = (PARTITION_TYPE)aom_read_tree(r, av1_partition_tree, probs, ACCT_STR);
#endif
+#endif // CONFIG_EXT_PARTITION_TYPES
else if (!has_rows && has_cols)
p = aom_read(r, probs[1], ACCT_STR) ? PARTITION_SPLIT : PARTITION_HORZ;
else if (has_rows && !has_cols)
@@ -750,6 +1609,20 @@
return p;
}
+#if CONFIG_SUPERTX
+static int read_skip(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
+ aom_reader *r) {
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 1;
+ } else {
+ const int ctx = av1_get_skip_context(xd);
+ const int skip = aom_read(r, cm->fc->skip_probs[ctx], ACCT_STR);
+ FRAME_COUNTS *counts = xd->counts;
+ if (counts) ++counts->skip[ctx][skip];
+ return skip;
+ }
+}
+#endif // CONFIG_SUPERTX
#if CONFIG_CLPF
static int clpf_all_skip(const AV1_COMMON *cm, int mi_col, int mi_row,
int size) {
@@ -769,6 +1642,9 @@
// TODO(slavarnway): eliminate bsize and subsize in future commits
static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif
int mi_row, int mi_col, aom_reader *r,
BLOCK_SIZE bsize, int n4x4_l2) {
AV1_COMMON *const cm = &pbi->common;
@@ -777,56 +1653,332 @@
const int hbs = num_8x8_wh >> 1;
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
+#if CONFIG_EXT_PARTITION_TYPES
+ BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
const int has_rows = (mi_row + hbs) < cm->mi_rows;
const int has_cols = (mi_col + hbs) < cm->mi_cols;
+#if CONFIG_SUPERTX
+ const int read_token = !supertx_enabled;
+ int skip = 0;
+ TX_SIZE supertx_size = b_width_log2_lookup[bsize];
+ const TileInfo *const tile = &xd->tile;
+ int txfm = DCT_DCT;
+#endif // CONFIG_SUPERTX
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
- partition =
- read_partition(cm, xd, mi_row, mi_col, r, has_rows, has_cols, n8x8_l2);
+ partition = read_partition(cm, xd, mi_row, mi_col, r, has_rows, has_cols,
+#if CONFIG_EXT_PARTITION_TYPES
+ bsize,
+#endif
+ n8x8_l2);
subsize = subsize_lookup[partition][bsize]; // get_subsize(bsize, partition);
#if CONFIG_PVQ
assert(partition < PARTITION_TYPES);
assert(subsize < BLOCK_SIZES);
#endif
+#if CONFIG_SUPERTX
+ if (!frame_is_intra_only(cm) && partition != PARTITION_NONE &&
+ bsize <= MAX_SUPERTX_BLOCK_SIZE && !supertx_enabled && !xd->lossless[0]) {
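+ // Read the flag signaling whether this partition is coded as one supertx block.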
+ const int supertx_context = partition_supertx_context_lookup[partition];
+ supertx_enabled = aom_read(
+ r, cm->fc->supertx_prob[supertx_context][supertx_size], ACCT_STR);
+ if (xd->counts)
+ xd->counts->supertx[supertx_context][supertx_size][supertx_enabled]++;
+#if CONFIG_VAR_TX
+ if (supertx_enabled) xd->supertx_size = supertx_size;
+#endif
+ }
+#endif // CONFIG_SUPERTX
if (!hbs) {
// calculate bmode block dimensions (log 2)
xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
- decode_block(pbi, xd, mi_row, mi_col, r, subsize, 1, 1);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col, r,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif // CONFIG_EXT_PARTITION_TYPES
+ subsize, 1, 1);
} else {
switch (partition) {
case PARTITION_NONE:
- decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n4x4_l2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col, r,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif // CONFIG_EXT_PARTITION_TYPES
+ subsize, n4x4_l2, n4x4_l2);
break;
case PARTITION_HORZ:
- decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n8x8_l2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col, r,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif // CONFIG_EXT_PARTITION_TYPES
+ subsize, n4x4_l2, n8x8_l2);
if (has_rows)
- decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize, n4x4_l2,
- n8x8_l2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row + hbs, mi_col, r,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif // CONFIG_EXT_PARTITION_TYPES
+ subsize, n4x4_l2, n8x8_l2);
break;
case PARTITION_VERT:
- decode_block(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2, n4x4_l2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col, r,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif // CONFIG_EXT_PARTITION_TYPES
+ subsize, n8x8_l2, n4x4_l2);
if (has_cols)
- decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2,
- n4x4_l2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col + hbs, r,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif // CONFIG_EXT_PARTITION_TYPES
+ subsize, n8x8_l2, n4x4_l2);
break;
case PARTITION_SPLIT:
- decode_partition(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2);
- decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2);
- decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize, n8x8_l2);
- decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize,
- n8x8_l2);
+ decode_partition(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col, r, subsize, n8x8_l2);
+ decode_partition(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col + hbs, r, subsize, n8x8_l2);
+ decode_partition(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row + hbs, mi_col, r, subsize, n8x8_l2);
+ decode_partition(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row + hbs, mi_col + hbs, r, subsize, n8x8_l2);
break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row, mi_col, r, partition, bsize2, n8x8_l2, n8x8_l2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row, mi_col + hbs, r, partition, bsize2, n8x8_l2,
+ n8x8_l2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row + hbs, mi_col, r, partition, subsize, n4x4_l2,
+ n8x8_l2);
+ break;
+ case PARTITION_HORZ_B:
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row, mi_col, r, partition, subsize, n4x4_l2, n8x8_l2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row + hbs, mi_col, r, partition, bsize2, n8x8_l2,
+ n8x8_l2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row + hbs, mi_col + hbs, r, partition, bsize2, n8x8_l2,
+ n8x8_l2);
+ break;
+ case PARTITION_VERT_A:
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row, mi_col, r, partition, bsize2, n8x8_l2, n8x8_l2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row + hbs, mi_col, r, partition, bsize2, n8x8_l2,
+ n8x8_l2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row, mi_col + hbs, r, partition, subsize, n8x8_l2,
+ n4x4_l2);
+ break;
+ case PARTITION_VERT_B:
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row, mi_col, r, partition, subsize, n8x8_l2, n4x4_l2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row, mi_col + hbs, r, partition, bsize2, n8x8_l2,
+ n8x8_l2);
+ decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ mi_row + hbs, mi_col + hbs, r, partition, bsize2, n8x8_l2,
+ n8x8_l2);
+ break;
+#endif
default: assert(0 && "Invalid partition type");
}
}
+#if CONFIG_SUPERTX
+ if (supertx_enabled && read_token) {
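+ // Supertx path: read skip and tx type once for the whole block, build the
+ // full-block prediction, then reconstruct the residual.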
+ uint8_t *dst_buf[3];
+ int dst_stride[3], i;
+ int offset = mi_row * cm->mi_stride + mi_col;
+
+ set_segment_id_supertx(cm, mi_row, mi_col, bsize);
+
+ xd->mi = cm->mi_grid_visible + offset;
+ xd->mi[0] = cm->mi + offset;
+ set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[bsize], mi_col,
+ num_8x8_blocks_wide_lookup[bsize], cm->mi_rows, cm->mi_cols);
+ set_skip_context(xd, mi_row, mi_col);
+ skip = read_skip(cm, xd, xd->mi[0]->mbmi.segment_id_supertx, r);
+ if (skip) {
+ reset_skip_context(xd, bsize);
+ } else {
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(supertx_size, bsize, 1) > 1) {
+ int eset = get_ext_tx_set(supertx_size, bsize, 1);
+ if (eset > 0) {
+ txfm = aom_read_tree(r, av1_ext_tx_inter_tree[eset],
+ cm->fc->inter_ext_tx_prob[eset][supertx_size],
+ ACCT_STR);
+ if (xd->counts) ++xd->counts->inter_ext_tx[eset][supertx_size][txfm];
+ }
+ }
+#else
+ if (supertx_size < TX_32X32) {
+ txfm = aom_read_tree(r, av1_ext_tx_tree,
+ cm->fc->inter_ext_tx_prob[supertx_size], ACCT_STR);
+ if (xd->counts) ++xd->counts->inter_ext_tx[supertx_size][txfm];
+ }
+#endif // CONFIG_EXT_TX
+ }
+
+ av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ dst_buf[i] = xd->plane[i].dst.buf;
+ dst_stride[i] = xd->plane[i].dst.stride;
+ }
+ dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col, mi_row, mi_col, bsize,
+ bsize, dst_buf, dst_stride);
+
+ if (!skip) {
+ int eobtotal = 0;
+ MB_MODE_INFO *mbmi;
+ set_offsets_topblock(cm, xd, tile, bsize, mi_row, mi_col);
+ mbmi = &xd->mi[0]->mbmi;
+ mbmi->tx_type = txfm;
+ assert(mbmi->segment_id_supertx != MAX_SEGMENTS);
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ const struct macroblockd_plane *const pd = &xd->plane[i];
+ int row, col;
+ const TX_SIZE tx_size = i ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+ int max_blocks_wide =
+ pd->width + (xd->mb_to_right_edge >= 0
+ ? 0
+ : xd->mb_to_right_edge >> (3 + pd->subsampling_x));
+ int max_blocks_high =
+ pd->height +
+ (xd->mb_to_bottom_edge >= 0 ? 0 : xd->mb_to_bottom_edge >>
+ (3 + pd->subsampling_y));
+
+ max_blocks_wide >>= tx_size_wide_log2[0];
+ max_blocks_high >>= tx_size_wide_log2[0];
+
+ for (row = 0; row < max_blocks_high; row += stepr)
+ for (col = 0; col < max_blocks_wide; col += stepc)
+ eobtotal += reconstruct_inter_block(
+ cm, xd, r, mbmi->segment_id_supertx, i, row, col, tx_size);
+ }
+ if (!(subsize < BLOCK_8X8) && eobtotal == 0) skip = 1;
+ }
+ set_param_topblock(cm, xd, bsize, mi_row, mi_col, txfm, skip);
+ }
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_EXT_PARTITION_TYPES
+ if (bsize >= BLOCK_8X8) {
+ switch (partition) {
+ case PARTITION_SPLIT:
+ if (bsize > BLOCK_8X8) break;
+ case PARTITION_NONE:
+ case PARTITION_HORZ:
+ case PARTITION_VERT:
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+ break;
+ case PARTITION_HORZ_A:
+ update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+ update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize);
+ break;
+ case PARTITION_HORZ_B:
+ update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+ update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize);
+ break;
+ case PARTITION_VERT_A:
+ update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+ update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize);
+ break;
+ case PARTITION_VERT_B:
+ update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+ update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize);
+ break;
+ default: assert(0 && "Invalid partition type");
+ }
+ }
+#else
// update partition context
if (bsize >= BLOCK_8X8 &&
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
+#endif // CONFIG_EXT_PARTITION_TYPES
#if CONFIG_DERING
if (bsize == BLOCK_64X64) {
@@ -878,11 +2030,12 @@
#endif
}
-static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end,
- size_t read_size,
- struct aom_internal_error_info *error_info,
- aom_reader *r, aom_decrypt_cb decrypt_cb,
- void *decrypt_state) {
+#if !CONFIG_ANS
+static void setup_bool_decoder(const uint8_t *data, const uint8_t *data_end,
+ const size_t read_size,
+ struct aom_internal_error_info *error_info,
+ aom_reader *r, aom_decrypt_cb decrypt_cb,
+ void *decrypt_state) {
// Validate the calculated partition length. If the buffer
// described by the partition can't be fully read, then restrict
// it to the portion that can be (for EC mode) or throw an error.
@@ -894,6 +2047,27 @@
aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
"Failed to allocate bool decoder %d", 1);
}
+#else
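+// With CONFIG_ANS, tile data is read with an ANS decoder (ans_read_init)
+// rather than the boolean decoder above.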
+static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end,
+ const size_t read_size,
+ struct aom_internal_error_info *error_info,
+ struct AnsDecoder *const ans,
+ aom_decrypt_cb decrypt_cb,
+ void *decrypt_state) {
+ (void)decrypt_cb;
+ (void)decrypt_state;
+ // Validate the calculated partition length. If the buffer
+ // described by the partition can't be fully read, then restrict
+ // it to the portion that can be (for EC mode) or throw an error.
+ if (!read_is_valid(data, read_size, data_end))
+ aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt tile length");
+
+ if (read_size > INT_MAX || ans_read_init(ans, data, (int)read_size))
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate token decoder %d", 1);
+}
+#endif
#if !CONFIG_PVQ
static void read_coef_probs_common(av1_coeff_probs_model *coef_probs,
@@ -970,8 +2144,131 @@
}
}
-static void setup_loopfilter(struct loopfilter *lf,
- struct aom_read_bit_buffer *rb) {
+#if CONFIG_LOOP_RESTORATION
+static void decode_restoration_mode(AV1_COMMON *cm,
+ struct aom_read_bit_buffer *rb) {
+ RestorationInfo *rsi = &cm->rst_info;
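+  // Two bits select the frame-level restoration mode: the first bit chooses
+  // between a single filter type (1) and none/switchable (0); the second bit
+  // then picks WIENER vs BILATERAL, or SWITCHABLE vs NONE, respectively.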
+ if (aom_rb_read_bit(rb)) {
+ rsi->frame_restoration_type =
+ aom_rb_read_bit(rb) ? RESTORE_WIENER : RESTORE_BILATERAL;
+ } else {
+ rsi->frame_restoration_type =
+ aom_rb_read_bit(rb) ? RESTORE_SWITCHABLE : RESTORE_NONE;
+ }
+}
+
+static void decode_restoration(AV1_COMMON *cm, aom_reader *rb) {
+ int i;
+ RestorationInfo *rsi = &cm->rst_info;
+ const int ntiles =
+ av1_get_rest_ntiles(cm->width, cm->height, NULL, NULL, NULL, NULL);
+ if (rsi->frame_restoration_type != RESTORE_NONE) {
+ rsi->restoration_type = (RestorationType *)aom_realloc(
+ rsi->restoration_type, sizeof(*rsi->restoration_type) * ntiles);
+ if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) {
+ rsi->bilateral_info = (BilateralInfo *)aom_realloc(
+ rsi->bilateral_info, sizeof(*rsi->bilateral_info) * ntiles);
+ assert(rsi->bilateral_info != NULL);
+ rsi->wiener_info = (WienerInfo *)aom_realloc(
+ rsi->wiener_info, sizeof(*rsi->wiener_info) * ntiles);
+ assert(rsi->wiener_info != NULL);
+ for (i = 0; i < ntiles; ++i) {
+ rsi->restoration_type[i] =
+ aom_read_tree(rb, av1_switchable_restore_tree,
+ cm->fc->switchable_restore_prob, ACCT_STR);
+ if (rsi->restoration_type[i] == RESTORE_WIENER) {
+ rsi->wiener_info[i].level = 1;
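+          // Each Wiener filter is coded as three vertical and three
+          // horizontal tap values, each offset by a per-tap minimum.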
+ rsi->wiener_info[i].vfilter[0] =
+ aom_read_literal(rb, WIENER_FILT_TAP0_BITS, ACCT_STR) +
+ WIENER_FILT_TAP0_MINV;
+ rsi->wiener_info[i].vfilter[1] =
+ aom_read_literal(rb, WIENER_FILT_TAP1_BITS, ACCT_STR) +
+ WIENER_FILT_TAP1_MINV;
+ rsi->wiener_info[i].vfilter[2] =
+ aom_read_literal(rb, WIENER_FILT_TAP2_BITS, ACCT_STR) +
+ WIENER_FILT_TAP2_MINV;
+ rsi->wiener_info[i].hfilter[0] =
+ aom_read_literal(rb, WIENER_FILT_TAP0_BITS, ACCT_STR) +
+ WIENER_FILT_TAP0_MINV;
+ rsi->wiener_info[i].hfilter[1] =
+ aom_read_literal(rb, WIENER_FILT_TAP1_BITS, ACCT_STR) +
+ WIENER_FILT_TAP1_MINV;
+ rsi->wiener_info[i].hfilter[2] =
+ aom_read_literal(rb, WIENER_FILT_TAP2_BITS, ACCT_STR) +
+ WIENER_FILT_TAP2_MINV;
+ } else if (rsi->restoration_type[i] == RESTORE_BILATERAL) {
+ int s;
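+          // A level of -1 means no bilateral filtering is signalled for the
+          // corresponding subtile.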
+ for (s = 0; s < BILATERAL_SUBTILES; ++s) {
+#if BILATERAL_SUBTILES == 0
+ rsi->bilateral_info[i].level[s] =
+ aom_read_literal(rb, av1_bilateral_level_bits(cm), ACCT_STR);
+#else
+ if (aom_read(rb, RESTORE_NONE_BILATERAL_PROB, ACCT_STR)) {
+ rsi->bilateral_info[i].level[s] =
+ aom_read_literal(rb, av1_bilateral_level_bits(cm), ACCT_STR);
+ } else {
+ rsi->bilateral_info[i].level[s] = -1;
+ }
+#endif
+ }
+ }
+ }
+ } else if (rsi->frame_restoration_type == RESTORE_WIENER) {
+ rsi->wiener_info = (WienerInfo *)aom_realloc(
+ rsi->wiener_info, sizeof(*rsi->wiener_info) * ntiles);
+ assert(rsi->wiener_info != NULL);
+ for (i = 0; i < ntiles; ++i) {
+ if (aom_read(rb, RESTORE_NONE_WIENER_PROB, ACCT_STR)) {
+ rsi->wiener_info[i].level = 1;
+ rsi->restoration_type[i] = RESTORE_WIENER;
+ rsi->wiener_info[i].vfilter[0] =
+ aom_read_literal(rb, WIENER_FILT_TAP0_BITS, ACCT_STR) +
+ WIENER_FILT_TAP0_MINV;
+ rsi->wiener_info[i].vfilter[1] =
+ aom_read_literal(rb, WIENER_FILT_TAP1_BITS, ACCT_STR) +
+ WIENER_FILT_TAP1_MINV;
+ rsi->wiener_info[i].vfilter[2] =
+ aom_read_literal(rb, WIENER_FILT_TAP2_BITS, ACCT_STR) +
+ WIENER_FILT_TAP2_MINV;
+ rsi->wiener_info[i].hfilter[0] =
+ aom_read_literal(rb, WIENER_FILT_TAP0_BITS, ACCT_STR) +
+ WIENER_FILT_TAP0_MINV;
+ rsi->wiener_info[i].hfilter[1] =
+ aom_read_literal(rb, WIENER_FILT_TAP1_BITS, ACCT_STR) +
+ WIENER_FILT_TAP1_MINV;
+ rsi->wiener_info[i].hfilter[2] =
+ aom_read_literal(rb, WIENER_FILT_TAP2_BITS, ACCT_STR) +
+ WIENER_FILT_TAP2_MINV;
+ } else {
+ rsi->wiener_info[i].level = 0;
+ rsi->restoration_type[i] = RESTORE_NONE;
+ }
+ }
+ } else {
+ rsi->bilateral_info = (BilateralInfo *)aom_realloc(
+ rsi->bilateral_info, sizeof(*rsi->bilateral_info) * ntiles);
+ assert(rsi->bilateral_info != NULL);
+ for (i = 0; i < ntiles; ++i) {
+ int s;
+ rsi->restoration_type[i] = RESTORE_BILATERAL;
+ for (s = 0; s < BILATERAL_SUBTILES; ++s) {
+ if (aom_read(rb, RESTORE_NONE_BILATERAL_PROB, ACCT_STR)) {
+ rsi->bilateral_info[i].level[s] =
+ aom_read_literal(rb, av1_bilateral_level_bits(cm), ACCT_STR);
+ } else {
+ rsi->bilateral_info[i].level[s] = -1;
+ }
+ }
+ }
+ }
+ } else {
+ rsi->frame_restoration_type = RESTORE_NONE;
+ }
+}
+#endif // CONFIG_LOOP_RESTORATION
+
+static void setup_loopfilter(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+ struct loopfilter *lf = &cm->lf;
lf->filter_level = aom_rb_read_literal(rb, 6);
lf->sharpness_level = aom_rb_read_literal(rb, 3);
@@ -985,7 +2282,7 @@
if (lf->mode_ref_delta_update) {
int i;
- for (i = 0; i < MAX_REF_FRAMES; i++)
+ for (i = 0; i < TOTAL_REFS_PER_FRAME; i++)
if (aom_rb_read_bit(rb))
lf->ref_deltas[i] = aom_rb_read_inv_signed_literal(rb, 6);
@@ -1071,6 +2368,10 @@
int minqm = cm->min_qmlevel;
int maxqm = cm->max_qmlevel;
#endif
+#if CONFIG_NEW_QUANT
+ int b;
+ int dq;
+#endif // CONFIG_NEW_QUANT
if (cm->seg.enabled) {
for (i = 0; i < MAX_SEGMENTS; ++i) {
const int qindex = av1_get_qindex(&cm->seg, i, cm->base_qindex);
@@ -1095,7 +2396,17 @@
cm->uv_iqmatrix[i][1][j] = aom_iqmatrix(cm, qmlevel, 1, j, 1);
cm->uv_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmlevel, 1, j, 0);
}
-#endif
+#endif // CONFIG_AOM_QM
+#if CONFIG_NEW_QUANT
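+      // Build the NUQ dequantization tables for each quant (dq) profile and
+      // coefficient band of this segment.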
+ for (dq = 0; dq < QUANT_PROFILES; dq++) {
+ for (b = 0; b < COEF_BANDS; ++b) {
+ av1_get_dequant_val_nuq(cm->y_dequant[i][b != 0], b,
+ cm->y_dequant_nuq[i][dq][b], NULL, dq);
+ av1_get_dequant_val_nuq(cm->uv_dequant[i][b != 0], b,
+ cm->uv_dequant_nuq[i][dq][b], NULL, dq);
+ }
+ }
+#endif // CONFIG_NEW_QUANT
}
} else {
const int qindex = cm->base_qindex;
@@ -1121,6 +2432,16 @@
cm->uv_iqmatrix[i][0][j] = aom_iqmatrix(cm, qmlevel, 1, j, 0);
}
#endif
+#if CONFIG_NEW_QUANT
+ for (dq = 0; dq < QUANT_PROFILES; dq++) {
+ for (b = 0; b < COEF_BANDS; ++b) {
+ av1_get_dequant_val_nuq(cm->y_dequant[0][b != 0], b,
+ cm->y_dequant_nuq[0][dq][b], NULL, dq);
+ av1_get_dequant_val_nuq(cm->uv_dequant[0][b != 0], b,
+ cm->uv_dequant_nuq[0][dq][b], NULL, dq);
+ }
+ }
+#endif // CONFIG_NEW_QUANT
}
}
@@ -1140,8 +2461,9 @@
aom_free(cm->cur_frame->mvs);
cm->cur_frame->mi_rows = cm->mi_rows;
cm->cur_frame->mi_cols = cm->mi_cols;
- cm->cur_frame->mvs = (MV_REF *)aom_calloc(cm->mi_rows * cm->mi_cols,
- sizeof(*cm->cur_frame->mvs));
+ CHECK_MEM_ERROR(cm, cm->cur_frame->mvs,
+ (MV_REF *)aom_calloc(cm->mi_rows * cm->mi_cols,
+ sizeof(*cm->cur_frame->mvs)));
}
static void resize_context_buffers(AV1_COMMON *cm, int width, int height) {
@@ -1222,7 +2544,7 @@
int found = 0, i;
int has_valid_ref_frame = 0;
BufferPool *const pool = cm->buffer_pool;
- for (i = 0; i < REFS_PER_FRAME; ++i) {
+ for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
if (aom_rb_read_bit(rb)) {
YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
width = buf->y_crop_width;
@@ -1245,7 +2567,7 @@
// Check to make sure at least one of frames that this frame references
// has valid dimensions.
- for (i = 0; i < REFS_PER_FRAME; ++i) {
+ for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
RefBuffer *const ref_frame = &cm->frame_refs[i];
has_valid_ref_frame |=
valid_ref_frame_size(ref_frame->buf->y_crop_width,
@@ -1254,7 +2576,7 @@
if (!has_valid_ref_frame)
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
"Referenced frame has invalid size");
- for (i = 0; i < REFS_PER_FRAME; ++i) {
+ for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
RefBuffer *const ref_frame = &cm->frame_refs[i];
if (!valid_ref_frame_img_fmt(ref_frame->buf->bit_depth,
ref_frame->buf->subsampling_x,
@@ -1265,6 +2587,7 @@
}
resize_context_buffers(cm, width, height);
+
lock_buffer_pool(pool);
if (aom_realloc_frame_buffer(
get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x,
@@ -1290,8 +2613,41 @@
pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
}
-static void setup_tile_info(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
- AV1_COMMON *cm = &pbi->common;
+static void read_tile_info(AV1Decoder *const pbi,
+ struct aom_read_bit_buffer *const rb) {
+ AV1_COMMON *const cm = &pbi->common;
+#if CONFIG_EXT_TILE
+// Read the tile width/height
+#if CONFIG_EXT_PARTITION
+ if (cm->sb_size == BLOCK_128X128) {
+ cm->tile_width = aom_rb_read_literal(rb, 5) + 1;
+ cm->tile_height = aom_rb_read_literal(rb, 5) + 1;
+ } else
+#endif // CONFIG_EXT_PARTITION
+ {
+ cm->tile_width = aom_rb_read_literal(rb, 6) + 1;
+ cm->tile_height = aom_rb_read_literal(rb, 6) + 1;
+ }
+
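+  // The coded values are in units of superblocks; convert them to mi units
+  // and clamp to the frame dimensions.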
+ cm->tile_width <<= cm->mib_size_log2;
+ cm->tile_height <<= cm->mib_size_log2;
+
+ cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols);
+ cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows);
+
+ // Get the number of tiles
+ cm->tile_cols = 1;
+ while (cm->tile_cols * cm->tile_width < cm->mi_cols) ++cm->tile_cols;
+
+ cm->tile_rows = 1;
+ while (cm->tile_rows * cm->tile_height < cm->mi_rows) ++cm->tile_rows;
+
+ if (cm->tile_cols * cm->tile_rows > 1) {
+ // Read the number of bytes used to store tile size
+ pbi->tile_col_size_bytes = aom_rb_read_literal(rb, 2) + 1;
+ pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
+ }
+#else
int min_log2_tile_cols, max_log2_tile_cols, max_ones;
av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
@@ -1307,10 +2663,24 @@
// rows
cm->log2_tile_rows = aom_rb_read_bit(rb);
if (cm->log2_tile_rows) cm->log2_tile_rows += aom_rb_read_bit(rb);
+
+ cm->tile_cols = 1 << cm->log2_tile_cols;
+ cm->tile_rows = 1 << cm->log2_tile_rows;
+
+ cm->tile_width = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+ cm->tile_width >>= cm->log2_tile_cols;
+ cm->tile_height = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
+ cm->tile_height >>= cm->log2_tile_rows;
+
+ // round to integer multiples of superblock size
+ cm->tile_width = ALIGN_POWER_OF_TWO(cm->tile_width, MAX_MIB_SIZE_LOG2);
+ cm->tile_height = ALIGN_POWER_OF_TWO(cm->tile_height, MAX_MIB_SIZE_LOG2);
+
// tile size magnitude
- if (cm->log2_tile_rows > 0 || cm->log2_tile_cols > 0) {
- cm->tile_sz_mag = aom_rb_read_literal(rb, 2);
+ if (cm->tile_rows > 1 || cm->tile_cols > 1) {
+ pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
}
+#endif // CONFIG_EXT_TILE
#if CONFIG_TILE_GROUPS
// Store an index to the location of the tile group information
pbi->tg_size_bit_offset = rb->bit_offset;
@@ -1324,32 +2694,168 @@
#endif
}
-typedef struct TileBuffer {
- const uint8_t *data;
- size_t size;
- int col; // only used with multi-threaded decoding
-} TileBuffer;
-
-static int mem_get_varsize(const uint8_t *data, const int mag) {
- switch (mag) {
- case 0: return data[0];
- case 1: return mem_get_le16(data);
- case 2: return mem_get_le24(data);
- case 3: return mem_get_le32(data);
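+// Reads a little-endian value of 'sz' bytes (1 to 4) from 'src'.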
+static int mem_get_varsize(const uint8_t *src, const int sz) {
+ switch (sz) {
+ case 1: return src[0];
+ case 2: return mem_get_le16(src);
+ case 3: return mem_get_le24(src);
+ case 4: return mem_get_le32(src);
+ default: assert("Invalid size" && 0); return -1;
}
-
- assert("Invalid tile size marker value" && 0);
-
- return -1;
}
+#if CONFIG_EXT_TILE
// Reads the next tile returning its size and adjusting '*data' accordingly
// based on 'is_last'.
static void get_tile_buffer(const uint8_t *const data_end,
- const int tile_sz_mag, int is_last,
struct aom_internal_error_info *error_info,
const uint8_t **data, aom_decrypt_cb decrypt_cb,
- void *decrypt_state, TileBuffer *buf) {
+ void *decrypt_state,
+ TileBufferDec (*const tile_buffers)[MAX_TILE_COLS],
+ int tile_size_bytes, int col, int row) {
+ size_t size;
+
+ size_t copy_size = 0;
+ const uint8_t *copy_data = NULL;
+
+ if (!read_is_valid(*data, tile_size_bytes, data_end))
+ aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt tile length");
+ if (decrypt_cb) {
+ uint8_t be_data[4];
+ decrypt_cb(decrypt_state, *data, be_data, tile_size_bytes);
+
+    // Only read the number of bytes given by tile_size_bytes.
+ size = mem_get_varsize(be_data, tile_size_bytes);
+ } else {
+ size = mem_get_varsize(*data, tile_size_bytes);
+ }
+
+ // The top bit indicates copy mode
+ if ((size >> (tile_size_bytes * 8 - 1)) == 1) {
+ // The remaining bits in the top byte signal the row offset
+ int offset = (size >> (tile_size_bytes - 1) * 8) & 0x7f;
+
+    // Currently, only tiles in the same column may be used as reference tiles.
+ copy_data = tile_buffers[row - offset][col].data;
+ copy_size = tile_buffers[row - offset][col].size;
+ size = 0;
+ }
+
+ *data += tile_size_bytes;
+
+ if (size > (size_t)(data_end - *data))
+ aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt tile size");
+
+ if (size > 0) {
+ tile_buffers[row][col].data = *data;
+ tile_buffers[row][col].size = size;
+ } else {
+ tile_buffers[row][col].data = copy_data;
+ tile_buffers[row][col].size = copy_size;
+ }
+
+ *data += size;
+
+ tile_buffers[row][col].raw_data_end = *data;
+}
+
+static void get_tile_buffers(
+ AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end,
+ TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ const int have_tiles = tile_cols * tile_rows > 1;
+
+ if (!have_tiles) {
+ const uint32_t tile_size = data_end - data;
+ tile_buffers[0][0].data = data;
+ tile_buffers[0][0].size = tile_size;
+ tile_buffers[0][0].raw_data_end = NULL;
+ } else {
+ // We locate only the tile buffers that are required, which are the ones
+ // specified by pbi->dec_tile_col and pbi->dec_tile_row. Also, we always
+ // need the last (bottom right) tile buffer, as we need to know where the
+ // end of the compressed frame buffer is for proper superframe decoding.
+
+ const uint8_t *tile_col_data_end[MAX_TILE_COLS];
+ const uint8_t *const data_start = data;
+
+ const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+ const int single_row = pbi->dec_tile_row >= 0;
+ const int tile_rows_start = single_row ? dec_tile_row : 0;
+ const int tile_rows_end = single_row ? tile_rows_start + 1 : tile_rows;
+ const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+ const int single_col = pbi->dec_tile_col >= 0;
+ const int tile_cols_start = single_col ? dec_tile_col : 0;
+ const int tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+
+ const int tile_col_size_bytes = pbi->tile_col_size_bytes;
+ const int tile_size_bytes = pbi->tile_size_bytes;
+
+ size_t tile_col_size;
+ int r, c;
+
+ // Read tile column sizes for all columns (we need the last tile buffer)
+ for (c = 0; c < tile_cols; ++c) {
+ const int is_last = c == tile_cols - 1;
+ if (!is_last) {
+ tile_col_size = mem_get_varsize(data, tile_col_size_bytes);
+ data += tile_col_size_bytes;
+ tile_col_data_end[c] = data + tile_col_size;
+ } else {
+ tile_col_size = data_end - data;
+ tile_col_data_end[c] = data_end;
+ }
+ data += tile_col_size;
+ }
+
+ data = data_start;
+
+ // Read the required tile sizes.
+ for (c = tile_cols_start; c < tile_cols_end; ++c) {
+ const int is_last = c == tile_cols - 1;
+
+ if (c > 0) data = tile_col_data_end[c - 1];
+
+ if (!is_last) data += tile_col_size_bytes;
+
+ // Get the whole of the last column, otherwise stop at the required tile.
+ for (r = 0; r < (is_last ? tile_rows : tile_rows_end); ++r) {
+ tile_buffers[r][c].col = c;
+
+ get_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data,
+ pbi->decrypt_cb, pbi->decrypt_state, tile_buffers,
+ tile_size_bytes, c, r);
+ }
+ }
+
+ // If we have not read the last column, then read it to get the last tile.
+ if (tile_cols_end != tile_cols) {
+ c = tile_cols - 1;
+
+ data = tile_col_data_end[c - 1];
+
+ for (r = 0; r < tile_rows; ++r) {
+ tile_buffers[r][c].col = c;
+
+ get_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data,
+ pbi->decrypt_cb, pbi->decrypt_state, tile_buffers,
+ tile_size_bytes, c, r);
+ }
+ }
+ }
+}
+#else
+// Reads the next tile returning its size and adjusting '*data' accordingly
+// based on 'is_last'.
+static void get_tile_buffer(const uint8_t *const data_end,
+ const int tile_size_bytes, int is_last,
+ struct aom_internal_error_info *error_info,
+ const uint8_t **data, aom_decrypt_cb decrypt_cb,
+ void *decrypt_state, TileBufferDec *const buf) {
size_t size;
if (!is_last) {
@@ -1359,12 +2865,12 @@
if (decrypt_cb) {
uint8_t be_data[4];
- decrypt_cb(decrypt_state, *data, be_data, tile_sz_mag + 1);
- size = mem_get_varsize(be_data, tile_sz_mag) + 1;
+ decrypt_cb(decrypt_state, *data, be_data, tile_size_bytes);
+ size = mem_get_varsize(be_data, tile_size_bytes);
} else {
- size = mem_get_varsize(*data, tile_sz_mag) + 1;
+ size = mem_get_varsize(*data, tile_size_bytes);
}
- *data += tile_sz_mag + 1;
+ *data += tile_size_bytes;
if (size > (size_t)(data_end - *data))
aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
@@ -1379,12 +2885,14 @@
*data += size;
}
-static void get_tile_buffers(AV1Decoder *pbi, const uint8_t *data,
- const uint8_t *data_end, int tile_cols,
- int tile_rows,
- TileBuffer (*tile_buffers)[1 << 6]) {
+static void get_tile_buffers(
+ AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end,
+ TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
+ AV1_COMMON *const cm = &pbi->common;
#if CONFIG_TILE_GROUPS
int r, c;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
int tc = 0;
int first_tile_in_tg = 0;
int hdr_offset;
@@ -1397,7 +2905,7 @@
for (r = 0; r < tile_rows; ++r) {
for (c = 0; c < tile_cols; ++c, ++tc) {
- TileBuffer *const buf = &tile_buffers[r][c];
+ TileBufferDec *const buf = &tile_buffers[r][c];
hdr_offset = (tc && tc == first_tile_in_tg) ? hdr_size : 0;
buf->col = c;
@@ -1411,25 +2919,27 @@
}
first_tile_in_tg += tc == first_tile_in_tg ? pbi->tg_size : 0;
data += hdr_offset;
- get_tile_buffer(data_end, pbi->common.tile_sz_mag, 0, &pbi->common.error,
+ get_tile_buffer(data_end, pbi->tile_size_bytes, 0, &pbi->common.error,
&data, pbi->decrypt_cb, pbi->decrypt_state, buf);
}
}
#else
int r, c;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
for (r = 0; r < tile_rows; ++r) {
for (c = 0; c < tile_cols; ++c) {
const int is_last = (r == tile_rows - 1) && (c == tile_cols - 1);
- TileBuffer *const buf = &tile_buffers[r][c];
+ TileBufferDec *const buf = &tile_buffers[r][c];
buf->col = c;
- get_tile_buffer(data_end, pbi->common.tile_sz_mag, is_last,
- &pbi->common.error, &data, pbi->decrypt_cb,
- pbi->decrypt_state, buf);
+ get_tile_buffer(data_end, pbi->tile_size_bytes, is_last, &cm->error,
+ &data, pbi->decrypt_cb, pbi->decrypt_state, buf);
}
}
#endif
}
+#endif // CONFIG_EXT_TILE
#if CONFIG_PVQ
static void daala_dec_init(daala_dec_ctx *daala_dec, od_ec_dec *ec) {
@@ -1447,13 +2957,34 @@
const uint8_t *data_end) {
AV1_COMMON *const cm = &pbi->common;
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
- const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
- const int tile_cols = 1 << cm->log2_tile_cols;
- const int tile_rows = 1 << cm->log2_tile_rows;
- TileBuffer tile_buffers[4][1 << 6];
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ const int n_tiles = tile_cols * tile_rows;
+ TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
+#if CONFIG_EXT_TILE
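+  // With CONFIG_EXT_TILE, a non-negative pbi->dec_tile_row/dec_tile_col
+  // restricts decoding to a single tile row and/or column; otherwise all
+  // tiles are decoded.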
+ const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+ const int single_row = pbi->dec_tile_row >= 0;
+ const int tile_rows_start = single_row ? dec_tile_row : 0;
+ const int tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
+ const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+ const int single_col = pbi->dec_tile_col >= 0;
+ const int tile_cols_start = single_col ? dec_tile_col : 0;
+ const int tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+ const int inv_col_order = pbi->inv_tile_order && !single_col;
+ const int inv_row_order = pbi->inv_tile_order && !single_row;
+#else
+ const int tile_rows_start = 0;
+ const int tile_rows_end = tile_rows;
+ const int tile_cols_start = 0;
+ const int tile_cols_end = tile_cols;
+ const int inv_col_order = pbi->inv_tile_order;
+ const int inv_row_order = pbi->inv_tile_order;
+#endif // CONFIG_EXT_TILE
int tile_row, tile_col;
- int mi_row, mi_col;
- TileData *tile_data = NULL;
+
+#if CONFIG_ENTROPY
+ cm->do_subframe_update = n_tiles == 1;
+#endif // CONFIG_ENTROPY
if (cm->lf.filter_level && !cm->skip_loop_filter &&
pbi->lf_worker.data1 == NULL) {
@@ -1474,25 +3005,16 @@
pbi->mb.plane);
}
- assert(tile_rows <= 4);
- assert(tile_cols <= (1 << 6));
+ assert(tile_rows <= MAX_TILE_ROWS);
+ assert(tile_cols <= MAX_TILE_COLS);
- // Note: this memset assumes above_context[0], [1] and [2]
- // are allocated as part of the same buffer.
- memset(cm->above_context, 0,
- sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_cols);
+ get_tile_buffers(pbi, data, data_end, tile_buffers);
- memset(cm->above_seg_context, 0,
- sizeof(*cm->above_seg_context) * aligned_cols);
-
- get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
-
- if (pbi->tile_data == NULL || (tile_cols * tile_rows) != pbi->total_tiles) {
+ if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
aom_free(pbi->tile_data);
- CHECK_MEM_ERROR(
- cm, pbi->tile_data,
- aom_memalign(32, tile_cols * tile_rows * (sizeof(*pbi->tile_data))));
- pbi->total_tiles = tile_rows * tile_cols;
+ CHECK_MEM_ERROR(cm, pbi->tile_data,
+ aom_memalign(32, n_tiles * (sizeof(*pbi->tile_data))));
+ pbi->allocated_tiles = n_tiles;
}
#if CONFIG_ACCOUNTING
if (pbi->acct_enabled) {
@@ -1500,114 +3022,150 @@
}
#endif
// Load all tile information into tile_data.
- for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
- for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- const TileBuffer *const buf = &tile_buffers[tile_row][tile_col];
- tile_data = pbi->tile_data + tile_cols * tile_row + tile_col;
- tile_data->cm = cm;
- tile_data->xd = pbi->mb;
- tile_data->xd.corrupted = 0;
- tile_data->xd.counts =
+ for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
+ for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
+ const TileBufferDec *const buf = &tile_buffers[tile_row][tile_col];
+ TileData *const td = pbi->tile_data + tile_cols * tile_row + tile_col;
+
+ td->cm = cm;
+ td->xd = pbi->mb;
+ td->xd.corrupted = 0;
+ td->xd.counts =
cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD
? &cm->counts
: NULL;
- av1_zero(tile_data->dqcoeff);
+ av1_zero(td->dqcoeff);
#if CONFIG_PVQ
- av1_zero(tile_data->pvq_ref_coeff);
+ av1_zero(td->pvq_ref_coeff);
#endif
- av1_tile_init(&tile_data->xd.tile, tile_data->cm, tile_row, tile_col);
+ av1_tile_init(&td->xd.tile, td->cm, tile_row, tile_col);
+#if !CONFIG_ANS
+ setup_bool_decoder(buf->data, data_end, buf->size, &cm->error,
+ &td->bit_reader, pbi->decrypt_cb, pbi->decrypt_state);
+#else
setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
- &tile_data->bit_reader, pbi->decrypt_cb,
- pbi->decrypt_state);
+ &td->bit_reader, pbi->decrypt_cb, pbi->decrypt_state);
+#endif
#if CONFIG_ACCOUNTING
if (pbi->acct_enabled) {
- tile_data->bit_reader.accounting = &pbi->accounting;
+ td->bit_reader.accounting = &pbi->accounting;
} else {
- tile_data->bit_reader.accounting = NULL;
+ td->bit_reader.accounting = NULL;
}
#endif
- av1_init_macroblockd(cm, &tile_data->xd,
+ av1_init_macroblockd(cm, &td->xd,
#if CONFIG_PVQ
- tile_data->pvq_ref_coeff,
+ td->pvq_ref_coeff,
#endif
- tile_data->dqcoeff);
+ td->dqcoeff);
#if CONFIG_PVQ
- daala_dec_init(&tile_data->xd.daala_dec, &tile_data->bit_reader.ec);
+ daala_dec_init(&td->xd.daala_dec, &td->bit_reader.ec);
#endif
#if CONFIG_PALETTE
- tile_data->xd.plane[0].color_index_map = tile_data->color_index_map[0];
- tile_data->xd.plane[1].color_index_map = tile_data->color_index_map[1];
+ td->xd.plane[0].color_index_map = td->color_index_map[0];
+ td->xd.plane[1].color_index_map = td->color_index_map[1];
#endif // CONFIG_PALETTE
}
}
- for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
- TileInfo tile;
- av1_tile_set_row(&tile, cm, tile_row);
- for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end;
- mi_row += MAX_MIB_SIZE) {
- for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- const int col =
- pbi->inv_tile_order ? tile_cols - tile_col - 1 : tile_col;
- tile_data = pbi->tile_data + tile_cols * tile_row + col;
+ for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
+ const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row;
+ int mi_row = 0;
+ TileInfo tile_info;
+
+ av1_tile_set_row(&tile_info, cm, row);
+
+ for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
+ const int col = inv_col_order ? tile_cols - 1 - tile_col : tile_col;
+ TileData *const td = pbi->tile_data + tile_cols * row + col;
#if CONFIG_ACCOUNTING
- if (pbi->acct_enabled) {
- tile_data->bit_reader.accounting->last_tell_frac =
- aom_reader_tell_frac(&tile_data->bit_reader);
- }
+ if (pbi->acct_enabled) {
+ td->bit_reader.accounting->last_tell_frac =
+ aom_reader_tell_frac(&td->bit_reader);
+ }
#endif
- av1_tile_set_col(&tile, tile_data->cm, col);
- av1_zero(tile_data->xd.left_context);
- av1_zero(tile_data->xd.left_seg_context);
- for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
- mi_col += MAX_MIB_SIZE) {
- decode_partition(pbi, &tile_data->xd, mi_row, mi_col,
- &tile_data->bit_reader, BLOCK_64X64, 4);
+
+ av1_tile_set_col(&tile_info, cm, col);
+
+ av1_zero_above_context(cm, tile_info.mi_col_start, tile_info.mi_col_end);
+
+ for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
+ mi_row += cm->mib_size) {
+ int mi_col;
+
+ av1_zero_left_context(&td->xd);
+
+ for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+ mi_col += cm->mib_size) {
+ decode_partition(pbi, &td->xd,
+#if CONFIG_SUPERTX
+ 0,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col, &td->bit_reader, cm->sb_size,
+ b_width_log2_lookup[cm->sb_size]);
}
- pbi->mb.corrupted |= tile_data->xd.corrupted;
+ pbi->mb.corrupted |= td->xd.corrupted;
if (pbi->mb.corrupted)
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
"Failed to decode tile data");
+#if CONFIG_ENTROPY
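+        // With CONFIG_ENTROPY, coefficient probabilities are partially
+        // adapted at evenly spaced row intervals within the frame
+        // (subframe updates).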
+ if (cm->do_subframe_update &&
+ cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ if ((mi_row + MI_SIZE) %
+ (MI_SIZE *
+ AOMMAX(cm->mi_rows / MI_SIZE / COEF_PROBS_BUFS, 1)) ==
+ 0 &&
+ mi_row + MI_SIZE < cm->mi_rows &&
+ cm->coef_probs_update_idx < COEF_PROBS_BUFS - 1) {
+ av1_partial_adapt_probs(cm, mi_row, mi_col);
+ ++cm->coef_probs_update_idx;
+ }
+ }
+#endif // CONFIG_ENTROPY
}
+ }
+
+ assert(mi_row > 0);
// When parallel deblocking is enabled, deblocking should not
// be interleaved with decoding. Instead, deblocking should be done
// after the entire frame is decoded.
-#if !CONFIG_PARALLEL_DEBLOCKING
- // Loopfilter one row.
- if (cm->lf.filter_level && !cm->skip_loop_filter) {
- const int lf_start = mi_row - MAX_MIB_SIZE;
- LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1;
+#if !CONFIG_VAR_TX && !CONFIG_PARALLEL_DEBLOCKING
+ // Loopfilter one tile row.
+ if (cm->lf.filter_level && !cm->skip_loop_filter) {
+ LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1;
+ const int lf_start = AOMMAX(0, tile_info.mi_row_start - cm->mib_size);
+ const int lf_end = tile_info.mi_row_end - cm->mib_size;
- // delay the loopfilter by 1 macroblock row.
- if (lf_start < 0) continue;
+ // Delay the loopfilter if the first tile row is only
+ // a single superblock high.
+ if (lf_end <= 0) continue;
- // decoding has completed: finish up the loop filter in this thread.
- if (mi_row + MAX_MIB_SIZE >= cm->mi_rows) continue;
+ // Decoding has completed. Finish up the loop filter in this thread.
+ if (tile_info.mi_row_end >= cm->mi_rows) continue;
- winterface->sync(&pbi->lf_worker);
- lf_data->start = lf_start;
- lf_data->stop = mi_row;
- if (pbi->max_threads > 1) {
- winterface->launch(&pbi->lf_worker);
- } else {
- winterface->execute(&pbi->lf_worker);
- }
+ winterface->sync(&pbi->lf_worker);
+ lf_data->start = lf_start;
+ lf_data->stop = lf_end;
+ if (pbi->max_threads > 1) {
+ winterface->launch(&pbi->lf_worker);
+ } else {
+ winterface->execute(&pbi->lf_worker);
}
-// After loopfiltering, the last 7 row pixels in each superblock row may
-// still be changed by the longest loopfilter of the next superblock
-// row.
-#endif // !CONFIG_PARALLEL_DEBLOCKING
-
- if (cm->frame_parallel_decode)
- av1_frameworker_broadcast(pbi->cur_buf, mi_row << MAX_MIB_SIZE_LOG2);
}
+#endif // !CONFIG_VAR_TX && !CONFIG_PARALLEL_DEBLOCKING
+
+ // After loopfiltering, the last 7 row pixels in each superblock row may
+ // still be changed by the longest loopfilter of the next superblock row.
+ if (cm->frame_parallel_decode)
+ av1_frameworker_broadcast(pbi->cur_buf, mi_row << cm->mib_size_log2);
}
-#if CONFIG_ACCOUNTING
-// aom_accounting_dump(&pbi->accounting);
-#endif
-
+#if CONFIG_VAR_TX
+ // Loopfilter the whole frame.
+ av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb,
+ cm->lf.filter_level, 0, 0);
+#else
#if CONFIG_PARALLEL_DEBLOCKING
  // Loopfilter all rows in the frame.
if (cm->lf.filter_level && !cm->skip_loop_filter) {
@@ -1627,21 +3185,39 @@
winterface->execute(&pbi->lf_worker);
}
#endif // CONFIG_PARALLEL_DEBLOCKING
-
- // Get last tile data.
- tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
-
+#endif // CONFIG_VAR_TX
if (cm->frame_parallel_decode)
av1_frameworker_broadcast(pbi->cur_buf, INT_MAX);
+
+#if CONFIG_EXT_TILE
+ if (n_tiles == 1) {
+#if CONFIG_ANS
+ return data_end;
+#else
+ // Find the end of the single tile buffer
+ return aom_reader_find_end(&pbi->tile_data->bit_reader);
+#endif // CONFIG_ANS
+ } else {
+ // Return the end of the last tile buffer
+ return tile_buffers[tile_rows - 1][tile_cols - 1].raw_data_end;
+ }
+#else
#if CONFIG_ANS
return data_end;
#else
- return aom_reader_find_end(&tile_data->bit_reader);
-#endif
+ {
+ // Get last tile data.
+ TileData *const td = pbi->tile_data + tile_cols * tile_rows - 1;
+ return aom_reader_find_end(&td->bit_reader);
+ }
+#endif // CONFIG_ANS
+#endif // CONFIG_EXT_TILE
}
static int tile_worker_hook(TileWorkerData *const tile_data,
const TileInfo *const tile) {
+ AV1Decoder *const pbi = tile_data->pbi;
+ const AV1_COMMON *const cm = &pbi->common;
int mi_row, mi_col;
if (setjmp(tile_data->error_info.jmp)) {
@@ -1653,14 +3229,20 @@
tile_data->error_info.setjmp = 1;
tile_data->xd.error_info = &tile_data->error_info;
+ av1_zero_above_context(&pbi->common, tile->mi_col_start, tile->mi_col_end);
+
for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
- mi_row += MAX_MIB_SIZE) {
- av1_zero(tile_data->xd.left_context);
- av1_zero(tile_data->xd.left_seg_context);
+ mi_row += cm->mib_size) {
+ av1_zero_left_context(&tile_data->xd);
+
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
- mi_col += MAX_MIB_SIZE) {
- decode_partition(tile_data->pbi, &tile_data->xd, mi_row, mi_col,
- &tile_data->bit_reader, BLOCK_64X64, 4);
+ mi_col += cm->mib_size) {
+ decode_partition(pbi, &tile_data->xd,
+#if CONFIG_SUPERTX
+ 0,
+#endif
+ mi_row, mi_col, &tile_data->bit_reader, cm->sb_size,
+ b_width_log2_lookup[cm->sb_size]);
}
}
return !tile_data->xd.corrupted;
@@ -1668,8 +3250,8 @@
// sorts in descending order
static int compare_tile_buffers(const void *a, const void *b) {
- const TileBuffer *const buf1 = (const TileBuffer *)a;
- const TileBuffer *const buf2 = (const TileBuffer *)b;
+ const TileBufferDec *const buf1 = (const TileBufferDec *)a;
+ const TileBufferDec *const buf2 = (const TileBufferDec *)b;
return (int)(buf2->size - buf1->size);
}
@@ -1677,24 +3259,46 @@
const uint8_t *data_end) {
AV1_COMMON *const cm = &pbi->common;
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
- const uint8_t *bit_reader_end = NULL;
- const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
- const int tile_cols = 1 << cm->log2_tile_cols;
- const int tile_rows = 1 << cm->log2_tile_rows;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
const int num_workers = AOMMIN(pbi->max_threads & ~1, tile_cols);
- TileBuffer tile_buffers[1][1 << 6];
- int n;
- int final_worker = -1;
+ TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
+#if CONFIG_EXT_TILE
+ const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+ const int single_row = pbi->dec_tile_row >= 0;
+ const int tile_rows_start = single_row ? dec_tile_row : 0;
+ const int tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
+ const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+ const int single_col = pbi->dec_tile_col >= 0;
+ const int tile_cols_start = single_col ? dec_tile_col : 0;
+ const int tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+#else
+ const int tile_rows_start = 0;
+ const int tile_rows_end = tile_rows;
+ const int tile_cols_start = 0;
+ const int tile_cols_end = tile_cols;
+#endif // CONFIG_EXT_TILE
+ int tile_row, tile_col;
+ int i;
- assert(tile_cols <= (1 << 6));
- assert(tile_rows == 1);
- (void)tile_rows;
+#if !(CONFIG_ANS || CONFIG_EXT_TILE)
+ int final_worker = -1;
+#endif // !(CONFIG_ANS || CONFIG_EXT_TILE)
+
+ assert(tile_rows <= MAX_TILE_ROWS);
+ assert(tile_cols <= MAX_TILE_COLS);
+
+ assert(tile_cols * tile_rows > 1);
+
+#if CONFIG_ANS
+ // TODO(any): This might just work now. Needs to be tested.
+ abort(); // FIXME: Tile parsing broken
+#endif // CONFIG_ANS
// TODO(jzern): See if we can remove the restriction of passing in max
// threads to the decoder.
if (pbi->num_tile_workers == 0) {
const int num_threads = pbi->max_threads & ~1;
- int i;
CHECK_MEM_ERROR(cm, pbi->tile_workers,
aom_malloc(num_threads * sizeof(*pbi->tile_workers)));
// Ensure tile data offsets will be properly aligned. This may fail on
@@ -1718,131 +3322,138 @@
}
// Reset tile decoding hook
- for (n = 0; n < num_workers; ++n) {
- AVxWorker *const worker = &pbi->tile_workers[n];
+ for (i = 0; i < num_workers; ++i) {
+ AVxWorker *const worker = &pbi->tile_workers[i];
winterface->sync(worker);
worker->hook = (AVxWorkerHook)tile_worker_hook;
- worker->data1 = &pbi->tile_worker_data[n];
- worker->data2 = &pbi->tile_worker_info[n];
- }
-
- // Note: this memset assumes above_context[0], [1] and [2]
- // are allocated as part of the same buffer.
- memset(cm->above_context, 0,
- sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
- memset(cm->above_seg_context, 0,
- sizeof(*cm->above_seg_context) * aligned_mi_cols);
-
- // Load tile data into tile_buffers
- get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
-
- // Sort the buffers based on size in descending order.
- qsort(tile_buffers[0], tile_cols, sizeof(tile_buffers[0][0]),
- compare_tile_buffers);
-
- // Rearrange the tile buffers such that per-tile group the largest, and
- // presumably the most difficult, tile will be decoded in the main thread.
- // This should help minimize the number of instances where the main thread is
- // waiting for a worker to complete.
- {
- int group_start = 0;
- while (group_start < tile_cols) {
- const TileBuffer largest = tile_buffers[0][group_start];
- const int group_end = AOMMIN(group_start + num_workers, tile_cols) - 1;
- memmove(tile_buffers[0] + group_start, tile_buffers[0] + group_start + 1,
- (group_end - group_start) * sizeof(tile_buffers[0][0]));
- tile_buffers[0][group_end] = largest;
- group_start = group_end + 1;
- }
+ worker->data1 = &pbi->tile_worker_data[i];
+ worker->data2 = &pbi->tile_worker_info[i];
}
// Initialize thread frame counts.
if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
- int i;
-
for (i = 0; i < num_workers; ++i) {
- TileWorkerData *const tile_data =
- (TileWorkerData *)pbi->tile_workers[i].data1;
- av1_zero(tile_data->counts);
+ TileWorkerData *const twd = (TileWorkerData *)pbi->tile_workers[i].data1;
+ av1_zero(twd->counts);
}
}
- n = 0;
- while (n < tile_cols) {
- int i;
- for (i = 0; i < num_workers && n < tile_cols; ++i) {
- AVxWorker *const worker = &pbi->tile_workers[i];
- TileWorkerData *const tile_data = (TileWorkerData *)worker->data1;
- TileInfo *const tile = (TileInfo *)worker->data2;
- TileBuffer *const buf = &tile_buffers[0][n];
+ // Load tile data into tile_buffers
+ get_tile_buffers(pbi, data, data_end, tile_buffers);
- tile_data->pbi = pbi;
- tile_data->xd = pbi->mb;
- tile_data->xd.corrupted = 0;
- tile_data->xd.counts =
- cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD
- ? &tile_data->counts
- : NULL;
- av1_zero(tile_data->dqcoeff);
- av1_tile_init(tile, cm, 0, buf->col);
- av1_tile_init(&tile_data->xd.tile, cm, 0, buf->col);
- setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
- &tile_data->bit_reader, pbi->decrypt_cb,
- pbi->decrypt_state);
- av1_init_macroblockd(cm, &tile_data->xd,
+ for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
+ // Sort the buffers in this tile row based on size in descending order.
+ qsort(&tile_buffers[tile_row][tile_cols_start],
+ tile_cols_end - tile_cols_start, sizeof(tile_buffers[0][0]),
+ compare_tile_buffers);
+
+      // Rearrange the tile buffers in this tile row so that, within each
+      // group, the largest (and presumably the most difficult) tile is
+      // decoded in the main thread. This should help minimize the number of
+      // instances where the main thread is waiting for a worker to complete.
+ {
+ int group_start;
+ for (group_start = tile_cols_start; group_start < tile_cols_end;
+ group_start += num_workers) {
+ const int group_end = AOMMIN(group_start + num_workers, tile_cols);
+ const TileBufferDec largest = tile_buffers[tile_row][group_start];
+ memmove(&tile_buffers[tile_row][group_start],
+ &tile_buffers[tile_row][group_start + 1],
+ (group_end - group_start - 1) * sizeof(tile_buffers[0][0]));
+ tile_buffers[tile_row][group_end - 1] = largest;
+ }
+ }
+
+ for (tile_col = tile_cols_start; tile_col < tile_cols_end;) {
+ // Launch workers for individual columns
+ for (i = 0; i < num_workers && tile_col < tile_cols_end;
+ ++i, ++tile_col) {
+ TileBufferDec *const buf = &tile_buffers[tile_row][tile_col];
+ AVxWorker *const worker = &pbi->tile_workers[i];
+ TileWorkerData *const twd = (TileWorkerData *)worker->data1;
+ TileInfo *const tile_info = (TileInfo *)worker->data2;
+
+ twd->pbi = pbi;
+ twd->xd = pbi->mb;
+ twd->xd.corrupted = 0;
+ twd->xd.counts =
+ cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD
+ ? &twd->counts
+ : NULL;
+ av1_zero(twd->dqcoeff);
+ av1_tile_init(tile_info, cm, tile_row, buf->col);
+ av1_tile_init(&twd->xd.tile, cm, tile_row, buf->col);
+#if !CONFIG_ANS
+ setup_bool_decoder(buf->data, data_end, buf->size, &cm->error,
+ &twd->bit_reader, pbi->decrypt_cb,
+ pbi->decrypt_state);
+#else
+ setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
+ &twd->bit_reader, pbi->decrypt_cb,
+ pbi->decrypt_state);
+#endif // CONFIG_ANS
+ av1_init_macroblockd(cm, &twd->xd,
#if CONFIG_PVQ
- tile_data->pvq_ref_coeff,
+ twd->pvq_ref_coeff,
#endif
- tile_data->dqcoeff);
+ twd->dqcoeff);
#if CONFIG_PVQ
- daala_dec_init(&tile_data->xd.daala_dec, &tile_data->bit_reader.ec);
+ daala_dec_init(&twd->xd.daala_dec, &twd->bit_reader.ec);
#endif
#if CONFIG_PALETTE
- tile_data->xd.plane[0].color_index_map = tile_data->color_index_map[0];
- tile_data->xd.plane[1].color_index_map = tile_data->color_index_map[1];
+ twd->xd.plane[0].color_index_map = twd->color_index_map[0];
+ twd->xd.plane[1].color_index_map = twd->color_index_map[1];
#endif // CONFIG_PALETTE
- worker->had_error = 0;
- if (i == num_workers - 1 || n == tile_cols - 1) {
- winterface->execute(worker);
- } else {
- winterface->launch(worker);
+ worker->had_error = 0;
+ if (i == num_workers - 1 || tile_col == tile_cols_end - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+
+#if !(CONFIG_ANS || CONFIG_EXT_TILE)
+ if (tile_row == tile_rows - 1 && buf->col == tile_cols - 1) {
+ final_worker = i;
+ }
+#endif // !(CONFIG_ANS || CONFIG_EXT_TILE)
}
- if (buf->col == tile_cols - 1) {
- final_worker = i;
- }
-
- ++n;
- }
-
- for (; i > 0; --i) {
- AVxWorker *const worker = &pbi->tile_workers[i - 1];
- // TODO(jzern): The tile may have specific error data associated with
- // its aom_internal_error_info which could be propagated to the main info
- // in cm. Additionally once the threads have been synced and an error is
- // detected, there's no point in continuing to decode tiles.
- pbi->mb.corrupted |= !winterface->sync(worker);
- }
- if (final_worker > -1) {
- TileWorkerData *const tile_data =
- (TileWorkerData *)pbi->tile_workers[final_worker].data1;
- bit_reader_end = aom_reader_find_end(&tile_data->bit_reader);
- final_worker = -1;
- }
-
- // Accumulate thread frame counts.
- if (n >= tile_cols &&
- cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
- for (i = 0; i < num_workers; ++i) {
- TileWorkerData *const tile_data =
- (TileWorkerData *)pbi->tile_workers[i].data1;
- av1_accumulate_frame_counts(cm, &tile_data->counts, 1);
+ // Sync all workers
+ for (; i > 0; --i) {
+ AVxWorker *const worker = &pbi->tile_workers[i - 1];
+ // TODO(jzern): The tile may have specific error data associated with
+ // its aom_internal_error_info which could be propagated to the main
+ // info in cm. Additionally once the threads have been synced and an
+ // error is detected, there's no point in continuing to decode tiles.
+ pbi->mb.corrupted |= !winterface->sync(worker);
}
}
}
- return bit_reader_end;
+ // Accumulate thread frame counts.
+ if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ for (i = 0; i < num_workers; ++i) {
+ TileWorkerData *const twd = (TileWorkerData *)pbi->tile_workers[i].data1;
+ av1_accumulate_frame_counts(cm, &twd->counts);
+ }
+ }
+
+#if CONFIG_EXT_TILE
+ // Return the end of the last tile buffer
+ return tile_buffers[tile_rows - 1][tile_cols - 1].raw_data_end;
+#else
+#if CONFIG_ANS
+ return data_end;
+#else
+ assert(final_worker != -1);
+ {
+ TileWorkerData *const twd =
+ (TileWorkerData *)pbi->tile_workers[final_worker].data1;
+ return aom_reader_find_end(&twd->bit_reader);
+ }
+#endif // CONFIG_ANS
+#endif // CONFIG_EXT_TILE
}
static void error_handler(void *data) {
@@ -1902,7 +3513,6 @@
RefCntBuffer *const frame_bufs = pool->frame_bufs;
int i, mask, ref_index = 0;
size_t sz;
-
cm->last_frame_type = cm->frame_type;
cm->last_intra_only = cm->intra_only;
@@ -1927,9 +3537,11 @@
#endif
cm->show_existing_frame = aom_rb_read_bit(rb);
+
if (cm->show_existing_frame) {
// Show an existing frame directly.
const int frame_to_show = cm->ref_frame_map[aom_rb_read_literal(rb, 3)];
+
lock_buffer_pool(pool);
if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
unlock_buffer_pool(pool);
@@ -1937,68 +3549,18 @@
"Buffer %d does not contain a decoded frame",
frame_to_show);
}
-
ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
unlock_buffer_pool(pool);
+
cm->lf.filter_level = 0;
cm->show_frame = 1;
-
-#if CONFIG_EXT_REFS
- // NOTE: The existing frame to show is adopted as a reference frame.
- pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
-
- for (i = 0; i < REFS_PER_FRAME; ++i) {
- const int ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
- const int idx = cm->ref_frame_map[ref];
- RefBuffer *const ref_frame = &cm->frame_refs[i];
- ref_frame->idx = idx;
- ref_frame->buf = &frame_bufs[idx].buf;
- cm->ref_frame_sign_bias[LAST_FRAME + i] = aom_rb_read_bit(rb);
- }
-
- for (i = 0; i < REFS_PER_FRAME; ++i) {
- RefBuffer *const ref_buf = &cm->frame_refs[i];
-#if CONFIG_AOM_HIGHBITDEPTH
- av1_setup_scale_factors_for_frame(
- &ref_buf->sf, ref_buf->buf->y_crop_width, ref_buf->buf->y_crop_height,
- cm->width, cm->height, cm->use_highbitdepth);
-#else // CONFIG_AOM_HIGHBITDEPTH
- av1_setup_scale_factors_for_frame(
- &ref_buf->sf, ref_buf->buf->y_crop_width, ref_buf->buf->y_crop_height,
- cm->width, cm->height);
-#endif // CONFIG_AOM_HIGHBITDEPTH
- }
-
- // Generate next_ref_frame_map.
- lock_buffer_pool(pool);
- for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
- if (mask & 1) {
- cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
- ++frame_bufs[cm->new_fb_idx].ref_count;
- } else {
- cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
- }
- // Current thread holds the reference frame.
- if (cm->ref_frame_map[ref_index] >= 0)
- ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
- ++ref_index;
- }
-
- for (; ref_index < REF_FRAMES; ++ref_index) {
- cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
- // Current thread holds the reference frame.
- if (cm->ref_frame_map[ref_index] >= 0)
- ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
- }
- unlock_buffer_pool(pool);
- pbi->hold_ref_buf = 1;
-#else
pbi->refresh_frame_flags = 0;
+
if (cm->frame_parallel_decode) {
for (i = 0; i < REF_FRAMES; ++i)
cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
}
-#endif // CONFIG_EXT_REFS
+
return 0;
}
@@ -2014,7 +3576,7 @@
read_bitdepth_colorspace_sampling(cm, rb);
pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
- for (i = 0; i < REFS_PER_FRAME; ++i) {
+ for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
cm->frame_refs[i].idx = INVALID_IDX;
cm->frame_refs[i].buf = NULL;
}
@@ -2074,7 +3636,7 @@
}
#endif // CONFIG_EXT_REFS
- for (i = 0; i < REFS_PER_FRAME; ++i) {
+ for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
const int ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
const int idx = cm->ref_frame_map[ref];
RefBuffer *const ref_frame = &cm->frame_refs[i];
@@ -2096,7 +3658,7 @@
cm->allow_high_precision_mv = aom_rb_read_bit(rb);
cm->interp_filter = read_interp_filter(rb);
- for (i = 0; i < REFS_PER_FRAME; ++i) {
+ for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
RefBuffer *const ref_buf = &cm->frame_refs[i];
#if CONFIG_AOM_HIGHBITDEPTH
av1_setup_scale_factors_for_frame(
@@ -2128,14 +3690,9 @@
if (!cm->error_resilient_mode) {
cm->refresh_frame_context = aom_rb_read_bit(rb)
? REFRESH_FRAME_CONTEXT_FORWARD
- : REFRESH_FRAME_CONTEXT_OFF;
- if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD) {
- cm->refresh_frame_context = aom_rb_read_bit(rb)
- ? REFRESH_FRAME_CONTEXT_FORWARD
- : REFRESH_FRAME_CONTEXT_BACKWARD;
- }
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
} else {
- cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_OFF;
+ cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_FORWARD;
}
// This flag will be overridden by the call to av1_setup_past_independence
@@ -2159,6 +3716,7 @@
for (; ref_index < REF_FRAMES; ++ref_index) {
cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+
// Current thread holds the reference frame.
if (cm->ref_frame_map[ref_index] >= 0)
++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
@@ -2169,18 +3727,37 @@
if (frame_is_intra_only(cm) || cm->error_resilient_mode)
av1_setup_past_independence(cm);
- setup_loopfilter(&cm->lf, rb);
+#if CONFIG_EXT_PARTITION
+ set_sb_size(cm, aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64);
+#else
+ set_sb_size(cm, BLOCK_64X64);
+#endif // CONFIG_EXT_PARTITION
+
+ setup_loopfilter(cm, rb);
#if CONFIG_DERING
setup_dering(cm, rb);
#endif
#if CONFIG_CLPF
setup_clpf(pbi, rb);
#endif
+#if CONFIG_LOOP_RESTORATION
+ decode_restoration_mode(cm, rb);
+#endif // CONFIG_LOOP_RESTORATION
setup_quantization(cm, rb);
#if CONFIG_AOM_HIGHBITDEPTH
xd->bd = (int)cm->bit_depth;
#endif
+#if CONFIG_ENTROPY
+ av1_default_coef_probs(cm);
+ if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode ||
+ cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) {
+ for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc;
+ } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) {
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+ }
+#endif // CONFIG_ENTROPY
+
setup_segmentation(cm, rb);
#if CONFIG_DELTA_Q
@@ -2212,6 +3789,7 @@
: cm->base_qindex;
xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 &&
cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
+ xd->qindex[i] = qindex;
}
setup_segmentation_dequant(cm);
@@ -2219,7 +3797,7 @@
(!cm->seg.enabled && xd->lossless[0]) ? ONLY_4X4 : read_tx_mode(rb);
cm->reference_mode = read_frame_reference_mode(cm, rb);
- setup_tile_info(pbi, rb);
+ read_tile_info(pbi, rb);
sz = aom_rb_read_literal(rb, 16);
if (sz == 0)
@@ -2228,24 +3806,151 @@
return sz;
}
+#if CONFIG_EXT_TX
+#if !CONFIG_EC_ADAPT || !CONFIG_DAALA_EC
+static void read_ext_tx_probs(FRAME_CONTEXT *fc, aom_reader *r) {
+ int i, j, k;
+ int s;
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+ if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ if (!use_inter_ext_tx_for_txsize[s][i]) continue;
+ for (j = 0; j < num_ext_tx_set_inter[s] - 1; ++j)
+ av1_diff_update_prob(r, &fc->inter_ext_tx_prob[s][i][j], ACCT_STR);
+ }
+ }
+ }
+
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+ if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ if (!use_intra_ext_tx_for_txsize[s][i]) continue;
+ for (j = 0; j < INTRA_MODES; ++j)
+ for (k = 0; k < num_ext_tx_set_intra[s] - 1; ++k)
+ av1_diff_update_prob(r, &fc->intra_ext_tx_prob[s][i][j][k],
+ ACCT_STR);
+ }
+ }
+ }
+}
+#endif // !CONFIG_EC_ADAPT || !CONFIG_DAALA_EC
+#else
+
+#endif // CONFIG_EXT_TX
+#if CONFIG_SUPERTX
+static void read_supertx_probs(FRAME_CONTEXT *fc, aom_reader *r) {
+ int i, j;
+ if (aom_read(r, GROUP_DIFF_UPDATE_PROB, ACCT_STR)) {
+ for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
+ for (j = 1; j < TX_SIZES; ++j) {
+ av1_diff_update_prob(r, &fc->supertx_prob[i][j], ACCT_STR);
+ }
+ }
+ }
+}
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_GLOBAL_MOTION
+static void read_global_motion_params(Global_Motion_Params *params,
+ aom_prob *probs, aom_reader *r) {
+ GLOBAL_MOTION_TYPE gmtype =
+ aom_read_tree(r, av1_global_motion_types_tree, probs, ACCT_STR);
+ params->gmtype = gmtype;
+ params->motion_params.wmtype = gm_to_trans_type(gmtype);
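+  // Higher-order models fall through to also read the lower-order
+  // parameters (affine -> rotzoom -> translation).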
+ switch (gmtype) {
+ case GLOBAL_ZERO: break;
+ case GLOBAL_AFFINE:
+ params->motion_params.wmmat[4] =
+ (aom_read_primitive_symmetric(r, GM_ABS_ALPHA_BITS) *
+ GM_ALPHA_DECODE_FACTOR);
+ params->motion_params.wmmat[5] =
+ aom_read_primitive_symmetric(r, GM_ABS_ALPHA_BITS) *
+ GM_ALPHA_DECODE_FACTOR +
+ (1 << WARPEDMODEL_PREC_BITS);
+ // fallthrough intended
+ case GLOBAL_ROTZOOM:
+ params->motion_params.wmmat[2] =
+ aom_read_primitive_symmetric(r, GM_ABS_ALPHA_BITS) *
+ GM_ALPHA_DECODE_FACTOR;
+ params->motion_params.wmmat[3] =
+ (aom_read_primitive_symmetric(r, GM_ABS_ALPHA_BITS) *
+ GM_ALPHA_DECODE_FACTOR) +
+ (1 << WARPEDMODEL_PREC_BITS);
+ // fallthrough intended
+ case GLOBAL_TRANSLATION:
+ params->motion_params.wmmat[0] =
+ aom_read_primitive_symmetric(r, GM_ABS_TRANS_BITS) *
+ GM_TRANS_DECODE_FACTOR;
+ params->motion_params.wmmat[1] =
+ aom_read_primitive_symmetric(r, GM_ABS_TRANS_BITS) *
+ GM_TRANS_DECODE_FACTOR;
+ break;
+ default: assert(0);
+ }
+}
+
+static void read_global_motion(AV1_COMMON *cm, aom_reader *r) {
+ int frame;
+ memset(cm->global_motion, 0, sizeof(cm->global_motion));
+ for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+ read_global_motion_params(&cm->global_motion[frame],
+ cm->fc->global_motion_types_prob, r);
+ /*
+ printf("Dec Ref %d [%d]: %d %d %d %d\n",
+ frame, cm->current_video_frame,
+ cm->global_motion[frame].motion_params.wmmat[0].as_mv.row,
+ cm->global_motion[frame].motion_params.wmmat[0].as_mv.col,
+ cm->global_motion[frame].motion_params.wmmat[1].as_mv.row,
+ cm->global_motion[frame].motion_params.wmmat[1].as_mv.col);
+ */
+ }
+}
+#endif // CONFIG_GLOBAL_MOTION
+
static int read_compressed_header(AV1Decoder *pbi, const uint8_t *data,
size_t partition_size) {
AV1_COMMON *const cm = &pbi->common;
+#if CONFIG_SUPERTX
+ MACROBLOCKD *const xd = &pbi->mb;
+#endif
FRAME_CONTEXT *const fc = cm->fc;
aom_reader r;
- int k, i, j;
+ int k, i;
+#if !CONFIG_EC_ADAPT
+ int j;
+#endif
+#if !CONFIG_ANS
if (aom_reader_init(&r, data, partition_size, pbi->decrypt_cb,
pbi->decrypt_state))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate bool decoder 0");
+#else
+ if (ans_read_init(&r, data, (int)partition_size))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate compressed header ANS decoder");
+#endif // !CONFIG_ANS
- if (cm->tx_mode == TX_MODE_SELECT) read_tx_mode_probs(&fc->tx_probs, &r);
+#if CONFIG_LOOP_RESTORATION
+ decode_restoration(cm, &r);
+#endif
+
+ if (cm->tx_mode == TX_MODE_SELECT) read_tx_size_probs(fc, &r);
#if !CONFIG_PVQ
read_coef_probs(fc, cm->tx_mode, &r);
-#endif
+#if CONFIG_VAR_TX
+ for (k = 0; k < TXFM_PARTITION_CONTEXTS; ++k)
+ av1_diff_update_prob(&r, &fc->txfm_partition_prob[k], ACCT_STR);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (cm->tx_mode == TX_MODE_SELECT) {
+ for (i = 1; i < MAX_TX_DEPTH; ++i)
+ av1_diff_update_prob(&r, &fc->rect_tx_prob[i], ACCT_STR);
+ }
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+#endif // CONFIG_VAR_TX
+#endif // !CONFIG_PVQ
for (k = 0; k < SKIP_CONTEXTS; ++k)
av1_diff_update_prob(&r, &fc->skip_probs[k], ACCT_STR);
@@ -2269,10 +3974,23 @@
av1_diff_update_prob(&r, &fc->uv_mode_prob[j][i], ACCT_STR);
}
+#if CONFIG_EXT_PARTITION_TYPES
+ for (i = 0; i < PARTITION_TYPES - 1; ++i)
+ av1_diff_update_prob(&r, &fc->partition_prob[0][i], ACCT_STR);
+ for (j = 1; j < PARTITION_CONTEXTS; ++j)
+ for (i = 0; i < EXT_PARTITION_TYPES - 1; ++i)
+ av1_diff_update_prob(&r, &fc->partition_prob[j][i], ACCT_STR);
+#else
for (j = 0; j < PARTITION_CONTEXTS; ++j)
for (i = 0; i < PARTITION_TYPES - 1; ++i)
av1_diff_update_prob(&r, &fc->partition_prob[j][i], ACCT_STR);
-#endif
+#endif // CONFIG_EXT_PARTITION_TYPES
+#endif // EC_ADAPT, DAALA_EC
+#if CONFIG_EXT_INTRA
+ for (i = 0; i < INTRA_FILTERS + 1; ++i)
+ for (j = 0; j < INTRA_FILTERS - 1; ++j)
+ av1_diff_update_prob(&r, &fc->intra_filter_probs[i][j], ACCT_STR);
+#endif // CONFIG_EXT_INTRA
if (frame_is_intra_only(cm)) {
av1_copy(cm->kf_y_prob, av1_kf_y_mode_prob);
@@ -2291,13 +4009,39 @@
#endif
read_inter_mode_probs(fc, &r);
-#if CONFIG_MOTION_VAR
- for (j = 0; j < BLOCK_SIZES; ++j)
- if (is_motion_variation_allowed_bsize(j)) {
- for (i = 0; i < MOTION_MODES - 1; ++i)
- av1_diff_update_prob(&r, &fc->motion_mode_prob[j][i], ACCT_STR);
+#if CONFIG_EXT_INTER
+ read_inter_compound_mode_probs(fc, &r);
+ if (cm->reference_mode != COMPOUND_REFERENCE) {
+ for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+ if (is_interintra_allowed_bsize_group(i)) {
+ av1_diff_update_prob(&r, &fc->interintra_prob[i], ACCT_STR);
+ }
}
-#endif // CONFIG_MOTION_VAR
+ for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+ for (j = 0; j < INTERINTRA_MODES - 1; j++)
+ av1_diff_update_prob(&r, &fc->interintra_mode_prob[i][j], ACCT_STR);
+ }
+ for (i = 0; i < BLOCK_SIZES; i++) {
+ if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i)) {
+ av1_diff_update_prob(&r, &fc->wedge_interintra_prob[i], ACCT_STR);
+ }
+ }
+ }
+ if (cm->reference_mode != SINGLE_REFERENCE) {
+ for (i = 0; i < BLOCK_SIZES; i++) {
+ if (is_interinter_wedge_used(i)) {
+ av1_diff_update_prob(&r, &fc->wedge_interinter_prob[i], ACCT_STR);
+ }
+ }
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ for (i = BLOCK_8X8; i < BLOCK_SIZES; ++i) {
+ for (j = 0; j < MOTION_MODES - 1; ++j)
+ av1_diff_update_prob(&r, &fc->motion_mode_prob[i][j], ACCT_STR);
+ }
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
#if !CONFIG_EC_ADAPT
if (cm->interp_filter == SWITCHABLE) read_switchable_interp_probs(fc, &r);
@@ -2325,7 +4069,13 @@
#endif
#if !CONFIG_EC_ADAPT
read_ext_tx_probs(fc, &r);
+#endif // !CONFIG_EC_ADAPT
+#if CONFIG_SUPERTX
+ if (!xd->lossless[0]) read_supertx_probs(fc, &r);
#endif
+#if CONFIG_GLOBAL_MOTION
+ read_global_motion(cm, &r);
+#endif // CONFIG_GLOBAL_MOTION
}
#if CONFIG_EC_MULTISYMBOL
av1_coef_pareto_cdfs(fc);
@@ -2361,26 +4111,35 @@
sizeof(cm->counts.switchable_interp)));
assert(!memcmp(cm->counts.inter_mode, zero_counts.inter_mode,
sizeof(cm->counts.inter_mode)));
-#if CONFIG_MOTION_VAR
+#if CONFIG_EXT_INTER
+ assert(!memcmp(cm->counts.inter_compound_mode,
+ zero_counts.inter_compound_mode,
+ sizeof(cm->counts.inter_compound_mode)));
+ assert(!memcmp(cm->counts.interintra, zero_counts.interintra,
+ sizeof(cm->counts.interintra)));
+ assert(!memcmp(cm->counts.wedge_interintra, zero_counts.wedge_interintra,
+ sizeof(cm->counts.wedge_interintra)));
+ assert(!memcmp(cm->counts.wedge_interinter, zero_counts.wedge_interinter,
+ sizeof(cm->counts.wedge_interinter)));
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
assert(!memcmp(cm->counts.motion_mode, zero_counts.motion_mode,
sizeof(cm->counts.motion_mode)));
-#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
assert(!memcmp(cm->counts.intra_inter, zero_counts.intra_inter,
sizeof(cm->counts.intra_inter)));
assert(!memcmp(cm->counts.comp_inter, zero_counts.comp_inter,
sizeof(cm->counts.comp_inter)));
assert(!memcmp(cm->counts.single_ref, zero_counts.single_ref,
sizeof(cm->counts.single_ref)));
-#if CONFIG_EXT_REFS
- assert(!memcmp(cm->counts.comp_fwdref, zero_counts.comp_fwdref,
- sizeof(cm->counts.comp_fwdref)));
- assert(!memcmp(cm->counts.comp_bwdref, zero_counts.comp_bwdref,
- sizeof(cm->counts.comp_bwdref)));
-#else
assert(!memcmp(cm->counts.comp_ref, zero_counts.comp_ref,
sizeof(cm->counts.comp_ref)));
+#if CONFIG_EXT_REFS
+ assert(!memcmp(cm->counts.comp_bwdref, zero_counts.comp_bwdref,
+ sizeof(cm->counts.comp_bwdref)));
#endif // CONFIG_EXT_REFS
- assert(!memcmp(&cm->counts.tx, &zero_counts.tx, sizeof(cm->counts.tx)));
+ assert(!memcmp(&cm->counts.tx_size, &zero_counts.tx_size,
+ sizeof(cm->counts.tx_size)));
assert(!memcmp(cm->counts.skip, zero_counts.skip, sizeof(cm->counts.skip)));
#if CONFIG_REF_MV
assert(
@@ -2390,10 +4149,10 @@
#else
assert(!memcmp(&cm->counts.mv, &zero_counts.mv, sizeof(cm->counts.mv)));
#endif
- assert(!memcmp(cm->counts.intra_ext_tx, zero_counts.intra_ext_tx,
- sizeof(cm->counts.intra_ext_tx)));
assert(!memcmp(cm->counts.inter_ext_tx, zero_counts.inter_ext_tx,
sizeof(cm->counts.inter_ext_tx)));
+ assert(!memcmp(cm->counts.intra_ext_tx, zero_counts.intra_ext_tx,
+ sizeof(cm->counts.intra_ext_tx)));
}
#endif // NDEBUG
@@ -2435,7 +4194,7 @@
if (profile > 2) profile += aom_rb_read_bit(rb);
return (BITSTREAM_PROFILE)profile;
}
-
+#if CONFIG_TILE_GROUPS
static int read_all_headers(AV1Decoder *pbi, struct aom_read_bit_buffer *rb,
const uint8_t **p_data,
const uint8_t **p_data_end) {
@@ -2445,6 +4204,9 @@
pbi->first_partition_size = read_uncompressed_header(pbi, rb);
pbi->uncomp_hdr_size = aom_rb_bytes_read(rb);
+#if CONFIG_GLOBAL_MOTION
+ xd->global_motion = cm->global_motion;
+#endif // CONFIG_GLOBAL_MOTION
if (!pbi->first_partition_size) {
// showing a frame directly
@@ -2481,7 +4243,7 @@
return 0;
}
-
+#endif
void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data,
const uint8_t *data_end, const uint8_t **p_data_end) {
AV1_COMMON *const cm = &pbi->common;
@@ -2489,21 +4251,37 @@
struct aom_read_bit_buffer rb;
int context_updated = 0;
uint8_t clear_data[MAX_AV1_HEADER_SIZE];
- const int tile_rows = 1 << cm->log2_tile_rows;
- const int tile_cols = 1 << cm->log2_tile_cols;
- int early_terminate;
- YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
- xd->cur_buf = new_fb;
+ size_t first_partition_size;
+ YV12_BUFFER_CONFIG *new_fb;
#if CONFIG_BITSTREAM_DEBUG
bitstream_queue_set_frame_read(cm->current_video_frame * 2 + cm->show_frame);
#endif
- early_terminate = read_all_headers(
- pbi, init_read_bit_buffer(pbi, &rb, data, data_end, clear_data), &data,
- &data_end);
+ first_partition_size = read_uncompressed_header(
+ pbi, init_read_bit_buffer(pbi, &rb, data, data_end, clear_data));
+ new_fb = get_frame_new_buffer(cm);
+ xd->cur_buf = new_fb;
+#if CONFIG_GLOBAL_MOTION
+ xd->global_motion = cm->global_motion;
+#endif // CONFIG_GLOBAL_MOTION
- if (early_terminate) return;
+ if (!first_partition_size) {
+// showing a frame directly
+#if CONFIG_EXT_REFS
+ if (cm->show_existing_frame)
+ *p_data_end = data + aom_rb_bytes_read(&rb);
+ else
+#endif // CONFIG_EXT_REFS
+ *p_data_end = data + (cm->profile <= PROFILE_2 ? 1 : 2);
+
+ return;
+ }
+
+ data += aom_rb_bytes_read(&rb);
+ if (!read_is_valid(data, first_partition_size, data_end))
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt header length");
#if CONFIG_SIMP_MV_PRED
cm->setup_mi(cm);
@@ -2533,6 +4311,19 @@
av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
+ *cm->fc = cm->frame_contexts[cm->frame_context_idx];
+ if (!cm->fc->initialized)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Uninitialized entropy context.");
+
+ av1_zero(cm->counts);
+
+ xd->corrupted = 0;
+ new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
+ if (new_fb->corrupted)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Decode failed. Frame data header is corrupted.");
+
if (cm->lf.filter_level && !cm->skip_loop_filter) {
av1_loop_filter_frame_init(cm, cm->lf.filter_level);
}
@@ -2556,9 +4347,18 @@
av1_frameworker_unlock_stats(worker);
}
- if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) {
+#if CONFIG_ENTROPY
+ av1_copy(cm->starting_coef_probs, cm->fc->coef_probs);
+ cm->coef_probs_update_idx = 0;
+#endif // CONFIG_ENTROPY
+
+ if (pbi->max_threads > 1
+#if CONFIG_EXT_TILE
+ && pbi->dec_tile_col < 0 // Decoding all columns
+#endif // CONFIG_EXT_TILE
+ && cm->tile_cols > 1) {
// Multi-threaded tile decoder
- *p_data_end = decode_tiles_mt(pbi, data, data_end);
+ *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
if (!xd->corrupted) {
if (!cm->skip_loop_filter) {
// If multiple threads are used to decode tiles, then we use those
@@ -2572,8 +4372,16 @@
"Decode failed. Frame data is corrupted.");
}
} else {
- *p_data_end = decode_tiles(pbi, data, data_end);
+ *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
}
+#if CONFIG_LOOP_RESTORATION
+ if (cm->rst_info.restoration_type != RESTORE_NONE) {
+ av1_loop_restoration_init(&cm->rst_internal, &cm->rst_info,
+ cm->frame_type == KEY_FRAME, cm->width,
+ cm->height);
+ av1_loop_restoration_rows(new_fb, cm, 0, cm->mi_rows, 0);
+ }
+#endif // CONFIG_LOOP_RESTORATION
#if CONFIG_DERING
if (cm->dering_level && !cm->skip_loop_filter) {
@@ -2605,6 +4413,9 @@
if (!xd->corrupted) {
if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+#if CONFIG_ENTROPY
+ cm->partial_prob_update = 0;
+#endif // CONFIG_ENTROPY
av1_adapt_coef_probs(cm);
av1_adapt_intra_frame_probs(cm);
@@ -2621,7 +4432,6 @@
}
// Non frame parallel update frame context here.
- if (cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF &&
- !context_updated)
+ if (!cm->error_resilient_mode && !context_updated)
cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
}
diff --git a/av1/decoder/decodemv.c b/av1/decoder/decodemv.c
index 02e8945..a318c2f 100644
--- a/av1/decoder/decodemv.c
+++ b/av1/decoder/decodemv.c
@@ -26,6 +26,20 @@
#include "aom_dsp/aom_dsp_common.h"
#define ACCT_STR __func__
+#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
+static INLINE int read_uniform(aom_reader *r, int n) {
+ int l = get_unsigned_bits(n);
+ int m = (1 << l) - n;
+ int v = aom_read_literal(r, l - 1, ACCT_STR);
+
+ assert(l != 0);
+
+ if (v < m)
+ return v;
+ else
+ return (v << 1) - m + aom_read_literal(r, 1, ACCT_STR);
+}
+#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
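read_uniform() above is the truncated-binary (quasi-uniform) decode of a value in [0, n): the first (1 << l) - n codes use l - 1 bits and the remaining codes use l bits. A sketch of the matching encoder-side routine, assuming the aom_writer API (aom_write_literal) used elsewhere in the tree; it is illustrative, not the patch's code.

static void write_uniform(aom_writer *w, int n, int v) {
  const int l = get_unsigned_bits(n);
  const int m = (1 << l) - n;
  if (l == 0) return;
  if (v < m) {
    aom_write_literal(w, v, l - 1);        /* short code: l - 1 bits */
  } else {
    aom_write_literal(w, m + ((v - m) >> 1), l - 1);
    aom_write_literal(w, (v - m) & 1, 1);  /* long code: one extra bit */
  }
}

For example, with n = 5 (so l = 3, m = 3), values 0..2 take two bits and values 3..4 take three; the reader above inverts this exactly.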
#if CONFIG_DAALA_EC
static PREDICTION_MODE read_intra_mode(aom_reader *r, aom_cdf_prob *cdf) {
@@ -101,7 +115,22 @@
return uv_mode;
}
+#if CONFIG_EXT_INTER
+static INTERINTRA_MODE read_interintra_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
+ aom_reader *r, int size_group) {
+ const INTERINTRA_MODE ii_mode = (INTERINTRA_MODE)aom_read_tree(
+ r, av1_interintra_mode_tree, cm->fc->interintra_mode_prob[size_group],
+ ACCT_STR);
+ FRAME_COUNTS *counts = xd->counts;
+ if (counts) ++counts->interintra_mode[size_group][ii_mode];
+ return ii_mode;
+}
+#endif // CONFIG_EXT_INTER
+
static PREDICTION_MODE read_inter_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ MB_MODE_INFO *mbmi,
+#endif
aom_reader *r, int16_t ctx) {
#if CONFIG_REF_MV
FRAME_COUNTS *counts = xd->counts;
@@ -110,7 +139,23 @@
if (aom_read(r, mode_prob, ACCT_STR) == 0) {
if (counts) ++counts->newmv_mode[mode_ctx][0];
- return NEWMV;
+
+#if CONFIG_EXT_INTER
+ if (has_second_ref(mbmi)) {
+#endif // CONFIG_EXT_INTER
+ return NEWMV;
+#if CONFIG_EXT_INTER
+ } else {
+ mode_prob = cm->fc->new2mv_prob;
+ if (aom_read(r, mode_prob, ACCT_STR) == 0) {
+ if (counts) ++counts->new2mv_mode[0];
+ return NEWMV;
+ } else {
+ if (counts) ++counts->new2mv_mode[1];
+ return NEWFROMNEARMV;
+ }
+ }
+#endif // CONFIG_EXT_INTER
}
if (counts) ++counts->newmv_mode[mode_ctx][1];
@@ -132,8 +177,10 @@
if (ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8;
mode_prob = cm->fc->refmv_prob[mode_ctx];
+
if (aom_read(r, mode_prob, ACCT_STR) == 0) {
if (counts) ++counts->refmv_mode[mode_ctx][0];
+
return NEARESTMV;
} else {
if (counts) ++counts->refmv_mode[mode_ctx][1];
@@ -202,7 +249,7 @@
}
#endif
-#if CONFIG_MOTION_VAR
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
static MOTION_MODE read_motion_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
MB_MODE_INFO *mbmi, aom_reader *r) {
if (is_motion_variation_allowed(mbmi)) {
@@ -218,7 +265,22 @@
return SIMPLE_TRANSLATION;
}
}
-#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_EXT_INTER
+static PREDICTION_MODE read_inter_compound_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
+ aom_reader *r, int16_t ctx) {
+ const int mode =
+ aom_read_tree(r, av1_inter_compound_mode_tree,
+ cm->fc->inter_compound_mode_probs[ctx], ACCT_STR);
+ FRAME_COUNTS *counts = xd->counts;
+
+ if (counts) ++counts->inter_compound_mode[ctx][mode];
+
+ assert(is_inter_compound_mode(NEAREST_NEARESTMV + mode));
+ return NEAREST_NEARESTMV + mode;
+}
+#endif // CONFIG_EXT_INTER
static int read_segment_id(aom_reader *r, struct segmentation_probs *segp) {
#if CONFIG_DAALA_EC
@@ -228,32 +290,137 @@
#endif
}
-static TX_SIZE read_selected_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd,
- TX_SIZE max_tx_size, aom_reader *r) {
- FRAME_COUNTS *counts = xd->counts;
- const int ctx = get_tx_size_context(xd);
- const aom_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs);
- TX_SIZE tx_size = aom_read(r, tx_probs[TX_4X4], ACCT_STR) ? TX_8X8 : TX_4X4;
- if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
- tx_size += aom_read(r, tx_probs[TX_8X8], ACCT_STR);
- if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
- tx_size += aom_read(r, tx_probs[TX_16X16], ACCT_STR);
+#if CONFIG_VAR_TX
+static void read_tx_size_vartx(AV1_COMMON *cm, MACROBLOCKD *xd,
+ MB_MODE_INFO *mbmi, FRAME_COUNTS *counts,
+ TX_SIZE tx_size, int depth, int blk_row,
+ int blk_col, aom_reader *r) {
+ int is_split = 0;
+ const int tx_row = blk_row >> 1;
+ const int tx_col = blk_col >> 1;
+ const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+ const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
+ int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row,
+ mbmi->sb_type, tx_size);
+ TX_SIZE(*const inter_tx_size)
+ [MAX_MIB_SIZE] =
+ (TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col];
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (depth == MAX_VARTX_DEPTH) {
+ int idx, idy;
+ inter_tx_size[0][0] = tx_size;
+ for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
+ for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
+ inter_tx_size[idy][idx] = tx_size;
+ mbmi->tx_size = tx_size;
+ mbmi->min_tx_size = AOMMIN(mbmi->min_tx_size, get_min_tx_size(tx_size));
+ if (counts) ++counts->txfm_partition[ctx][0];
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, tx_size);
+ return;
}
- if (counts) ++get_tx_counts(max_tx_size, ctx, &counts->tx)[tx_size];
+ is_split = aom_read(r, cm->fc->txfm_partition_prob[ctx], ACCT_STR);
+
+ if (is_split) {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs];
+ int i;
+
+ if (counts) ++counts->txfm_partition[ctx][1];
+
+ if (tx_size == TX_8X8) {
+ inter_tx_size[0][0] = TX_4X4;
+ mbmi->tx_size = TX_4X4;
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, TX_4X4);
+ return;
+ }
+
+ assert(bsl > 0);
+ for (i = 0; i < 4; ++i) {
+ int offsetr = blk_row + (i >> 1) * bsl;
+ int offsetc = blk_col + (i & 0x01) * bsl;
+ read_tx_size_vartx(cm, xd, mbmi, counts, sub_txs, depth + 1, offsetr,
+ offsetc, r);
+ }
+ } else {
+ int idx, idy;
+ inter_tx_size[0][0] = tx_size;
+ for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
+ for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
+ inter_tx_size[idy][idx] = tx_size;
+ mbmi->tx_size = tx_size;
+ mbmi->min_tx_size = AOMMIN(mbmi->min_tx_size, get_min_tx_size(tx_size));
+ if (counts) ++counts->txfm_partition[ctx][0];
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, tx_size);
+ }
+}
+#endif
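read_tx_size_vartx() above descends a transform-size quadtree: at each node it either fixes the current tx_size for the whole region or reads a split flag and recurses into four quadrants one size smaller. A stripped-down sketch of the same traversal with the bitstream read replaced by a caller-supplied predicate; count_tx_leaves and split_fn are illustrative names, not part of the codebase.

typedef int (*split_fn)(int depth, void *ctx);

static int count_tx_leaves(int depth, int max_depth, split_fn should_split,
                           void *ctx) {
  int i, n = 0;
  /* Leaf: either the maximum depth was reached or the node is not split. */
  if (depth == max_depth || !should_split(depth, ctx)) return 1;
  /* Otherwise recurse into the four quadrants, as the loop over
   * (offsetr, offsetc) does above. */
  for (i = 0; i < 4; ++i)
    n += count_tx_leaves(depth + 1, max_depth, should_split, ctx);
  return n;
}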
+
+static TX_SIZE read_selected_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd,
+ int tx_size_cat, aom_reader *r) {
+ FRAME_COUNTS *counts = xd->counts;
+ const int ctx = get_tx_size_context(xd);
+ int depth = aom_read_tree(r, av1_tx_size_tree[tx_size_cat],
+ cm->fc->tx_size_probs[tx_size_cat][ctx], ACCT_STR);
+ TX_SIZE tx_size = depth_to_tx_size(depth);
+ if (counts) ++counts->tx_size[tx_size_cat][ctx][depth];
return tx_size;
}
-static TX_SIZE read_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd, int allow_select,
- aom_reader *r) {
+static TX_SIZE read_tx_size_intra(AV1_COMMON *cm, MACROBLOCKD *xd,
+ aom_reader *r) {
TX_MODE tx_mode = cm->tx_mode;
BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
- const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
if (xd->lossless[xd->mi[0]->mbmi.segment_id]) return TX_4X4;
- if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8)
- return read_selected_tx_size(cm, xd, max_tx_size, r);
- else
- return AOMMIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
+ if (bsize >= BLOCK_8X8) {
+ if (tx_mode == TX_MODE_SELECT) {
+ const TX_SIZE tx_size =
+ read_selected_tx_size(cm, xd, intra_tx_size_cat_lookup[bsize], r);
+ assert(tx_size <= max_txsize_lookup[bsize]);
+ return tx_size;
+ } else {
+ return tx_size_from_tx_mode(bsize, cm->tx_mode, 0);
+ }
+ } else {
+ return TX_4X4;
+ }
+}
+
+static TX_SIZE read_tx_size_inter(AV1_COMMON *cm, MACROBLOCKD *xd,
+ int allow_select, aom_reader *r) {
+ TX_MODE tx_mode = cm->tx_mode;
+ BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id]) return TX_4X4;
+ if (bsize >= BLOCK_8X8) {
+ if (allow_select && tx_mode == TX_MODE_SELECT) {
+ const TX_SIZE coded_tx_size =
+ read_selected_tx_size(cm, xd, inter_tx_size_cat_lookup[bsize], r);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (coded_tx_size > max_txsize_lookup[bsize]) {
+ assert(coded_tx_size == max_txsize_lookup[bsize] + 1);
+ return max_txsize_rect_lookup[bsize];
+ }
+#else
+ assert(coded_tx_size <= max_txsize_lookup[bsize]);
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ return coded_tx_size;
+ } else {
+ return tx_size_from_tx_mode(bsize, cm->tx_mode, 1);
+ }
+ } else {
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ assert(IMPLIES(tx_mode == ONLY_4X4, bsize == BLOCK_4X4));
+ return max_txsize_rect_lookup[bsize];
+#else
+ return TX_4X4;
+#endif
+ }
}
static int dec_get_segment_id(const AV1_COMMON *cm, const uint8_t *segment_ids,
@@ -319,8 +486,8 @@
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
int predicted_segment_id, segment_id;
const int mi_offset = mi_row * cm->mi_cols + mi_col;
- const int bw = xd->plane[0].n4_w >> 1;
- const int bh = xd->plane[0].n4_h >> 1;
+ const int bw = num_8x8_blocks_wide_lookup[mbmi->sb_type];
+ const int bh = num_8x8_blocks_high_lookup[mbmi->sb_type];
// TODO(slavarnway): move x_mis, y_mis into xd ?????
const int x_mis = AOMMIN(cm->mi_cols - mi_col, bw);
@@ -371,17 +538,6 @@
}
}
-#if CONFIG_EXT_INTRA || CONFIG_PALETTE
-static INLINE int read_uniform(aom_reader *r, int n) {
- const int l = get_unsigned_bits(n);
- const int m = (1 << l) - n;
- const int v = aom_read_literal(r, l - 1, ACCT_STR);
-
- assert(l != 0);
- return (v < m) ? v : ((v << 1) - m + aom_read_literal(r, 1, ACCT_STR));
-}
-#endif // CONFIG_EXT_INTRA || CONFIG_PALETTE
-
#if CONFIG_PALETTE
static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
aom_reader *r) {
@@ -409,10 +565,12 @@
n = pmi->palette_size[0];
for (i = 0; i < n; ++i)
pmi->palette_colors[i] = aom_read_literal(r, cm->bit_depth, ACCT_STR);
+
xd->plane[0].color_index_map[0] = read_uniform(r, n);
assert(xd->plane[0].color_index_map[0] < n);
}
}
+
if (mbmi->uv_mode == DC_PRED) {
if (aom_read(r, av1_default_palette_uv_mode_prob[pmi->palette_size[0] > 0],
ACCT_STR)) {
@@ -435,26 +593,162 @@
}
#endif // CONFIG_PALETTE
-#if CONFIG_EXT_INTRA
-static void read_intra_angle_info(MB_MODE_INFO *const mbmi, aom_reader *r) {
- mbmi->intra_angle_delta[0] = 0;
- mbmi->intra_angle_delta[1] = 0;
- if (mbmi->sb_type < BLOCK_8X8) return;
+#if CONFIG_FILTER_INTRA
+static void read_filter_intra_mode_info(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd, aom_reader *r) {
+ MODE_INFO *const mi = xd->mi[0];
+ MB_MODE_INFO *const mbmi = &mi->mbmi;
+ FRAME_COUNTS *counts = xd->counts;
+ FILTER_INTRA_MODE_INFO *filter_intra_mode_info =
+ &mbmi->filter_intra_mode_info;
- if (is_directional_mode(mbmi->mode)) {
- const TX_SIZE max_tx_size = max_txsize_lookup[mbmi->sb_type];
- const int max_angle_delta = av1_max_angle_delta_y[max_tx_size][mbmi->mode];
- mbmi->intra_angle_delta[0] =
- read_uniform(r, 2 * max_angle_delta + 1) - max_angle_delta;
+ if (mbmi->mode == DC_PRED
+#if CONFIG_PALETTE
+ && mbmi->palette_mode_info.palette_size[0] == 0
+#endif // CONFIG_PALETTE
+ ) {
+ filter_intra_mode_info->use_filter_intra_mode[0] =
+ aom_read(r, cm->fc->filter_intra_probs[0], ACCT_STR);
+ if (filter_intra_mode_info->use_filter_intra_mode[0]) {
+ filter_intra_mode_info->filter_intra_mode[0] =
+ read_uniform(r, FILTER_INTRA_MODES);
+ }
+ if (counts) {
+ ++counts->filter_intra[0]
+ [filter_intra_mode_info->use_filter_intra_mode[0]];
+ }
+ }
+ if (mbmi->uv_mode == DC_PRED
+#if CONFIG_PALETTE
+ && mbmi->palette_mode_info.palette_size[1] == 0
+#endif // CONFIG_PALETTE
+ ) {
+ filter_intra_mode_info->use_filter_intra_mode[1] =
+ aom_read(r, cm->fc->filter_intra_probs[1], ACCT_STR);
+ if (filter_intra_mode_info->use_filter_intra_mode[1]) {
+ filter_intra_mode_info->filter_intra_mode[1] =
+ read_uniform(r, FILTER_INTRA_MODES);
+ }
+ if (counts) {
+ ++counts->filter_intra[1]
+ [filter_intra_mode_info->use_filter_intra_mode[1]];
+ }
+ }
+}
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_EXT_INTRA
+static void read_intra_angle_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ aom_reader *r) {
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int ctx = av1_get_pred_context_intra_interp(xd);
+ int p_angle;
+
+ if (bsize < BLOCK_8X8) return;
+
+ if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED) {
+ mbmi->angle_delta[0] =
+ read_uniform(r, 2 * MAX_ANGLE_DELTAS + 1) - MAX_ANGLE_DELTAS;
+ p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle)) {
+ FRAME_COUNTS *counts = xd->counts;
+ mbmi->intra_filter = aom_read_tree(
+ r, av1_intra_filter_tree, cm->fc->intra_filter_probs[ctx], ACCT_STR);
+ if (counts) ++counts->intra_filter[ctx][mbmi->intra_filter];
+ } else {
+ mbmi->intra_filter = INTRA_FILTER_LINEAR;
+ }
}
- if (is_directional_mode(mbmi->uv_mode)) {
- mbmi->intra_angle_delta[1] =
- read_uniform(r, 2 * MAX_ANGLE_DELTA_UV + 1) - MAX_ANGLE_DELTA_UV;
+ if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED) {
+ mbmi->angle_delta[1] =
+ read_uniform(r, 2 * MAX_ANGLE_DELTAS + 1) - MAX_ANGLE_DELTAS;
}
}
#endif // CONFIG_EXT_INTRA
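In read_intra_angle_info() above, the decoded delta selects one of 2 * MAX_ANGLE_DELTAS + 1 prediction angles centred on the base angle of the directional mode. A small sketch of that mapping; the concrete numbers in the comment (a 45-degree base, a step of 3) are illustrative assumptions, not values taken from this patch.

static int predicted_angle(int base_angle, int angle_delta) {
  /* angle_delta lies in [-MAX_ANGLE_DELTAS, MAX_ANGLE_DELTAS]; e.g. a
   * 45-degree base with a step of 3 would yield angles 45 - 3*MAX_ANGLE_DELTAS
   * through 45 + 3*MAX_ANGLE_DELTAS. */
  return base_angle + angle_delta * ANGLE_STEP;
}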
+static void read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+ MB_MODE_INFO *mbmi,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif
+ aom_reader *r) {
+ const int inter_block = is_inter_block(mbmi);
+#if CONFIG_VAR_TX
+ const TX_SIZE tx_size = inter_block ? mbmi->min_tx_size : mbmi->tx_size;
+#else
+ const TX_SIZE tx_size = mbmi->tx_size;
+#endif
+ if (!FIXED_TX_TYPE) {
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(tx_size, mbmi->sb_type, inter_block) > 1 &&
+ cm->base_qindex > 0 && !mbmi->skip &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif // CONFIG_SUPERTX
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ int eset = get_ext_tx_set(tx_size, mbmi->sb_type, inter_block);
+ FRAME_COUNTS *counts = xd->counts;
+
+ if (inter_block) {
+ if (eset > 0) {
+ mbmi->tx_type = aom_read_tree(
+ r, av1_ext_tx_inter_tree[eset],
+ cm->fc->inter_ext_tx_prob[eset][txsize_sqr_map[tx_size]],
+ ACCT_STR);
+ if (counts)
+ ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]]
+ [mbmi->tx_type];
+ }
+ } else if (ALLOW_INTRA_EXT_TX) {
+ if (eset > 0) {
+ mbmi->tx_type = aom_read_tree(
+ r, av1_ext_tx_intra_tree[eset],
+ cm->fc->intra_ext_tx_prob[eset][tx_size][mbmi->mode], ACCT_STR);
+ if (counts)
+ ++counts->intra_ext_tx[eset][tx_size][mbmi->mode][mbmi->tx_type];
+ }
+ }
+ } else {
+ mbmi->tx_type = DCT_DCT;
+ }
+#else
+ if (tx_size < TX_32X32 && cm->base_qindex > 0 && !mbmi->skip &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif // CONFIG_SUPERTX
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ FRAME_COUNTS *counts = xd->counts;
+ if (inter_block) {
+#if CONFIG_DAALA_EC
+ mbmi->tx_type = av1_ext_tx_inv[aom_read_symbol(
+ r, cm->fc->inter_ext_tx_cdf[tx_size], TX_TYPES, ACCT_STR)];
+#else
+ mbmi->tx_type = aom_read_tree(
+ r, av1_ext_tx_tree, cm->fc->inter_ext_tx_prob[tx_size], ACCT_STR);
+#endif
+ if (counts) ++counts->inter_ext_tx[tx_size][mbmi->tx_type];
+ } else {
+ const TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
+#if CONFIG_DAALA_EC
+ mbmi->tx_type = av1_ext_tx_inv[aom_read_symbol(
+ r, cm->fc->intra_ext_tx_cdf[tx_size][tx_type_nom], TX_TYPES,
+ ACCT_STR)];
+#else
+ mbmi->tx_type = aom_read_tree(
+ r, av1_ext_tx_tree, cm->fc->intra_ext_tx_prob[tx_size][tx_type_nom],
+ ACCT_STR);
+#endif
+ if (counts) ++counts->intra_ext_tx[tx_size][tx_type_nom][mbmi->tx_type];
+ }
+ } else {
+ mbmi->tx_type = DCT_DCT;
+ }
+#endif // CONFIG_EXT_TX
+ }
+}
+
static void read_intra_frame_mode_info(AV1_COMMON *const cm,
MACROBLOCKD *const xd, int mi_row,
int mi_col, aom_reader *r) {
@@ -484,7 +778,7 @@
}
#endif
- mbmi->tx_size = read_tx_size(cm, xd, 1, r);
+ mbmi->tx_size = read_tx_size_intra(cm, xd, r);
mbmi->ref_frame[0] = INTRA_FRAME;
mbmi->ref_frame[1] = NONE;
@@ -538,7 +832,7 @@
mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
#if CONFIG_EXT_INTRA
- read_intra_angle_info(mbmi, r);
+ read_intra_angle_info(cm, xd, r);
#endif // CONFIG_EXT_INTRA
#if CONFIG_PALETTE
mbmi->palette_mode_info.palette_size[0] = 0;
@@ -546,25 +840,17 @@
if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
read_palette_mode_info(cm, xd, r);
#endif // CONFIG_PALETTE
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+ if (bsize >= BLOCK_8X8) read_filter_intra_mode_info(cm, xd, r);
+#endif // CONFIG_FILTER_INTRA
- if (mbmi->tx_size < TX_32X32 && cm->base_qindex > 0 && !mbmi->skip &&
- !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
- FRAME_COUNTS *counts = xd->counts;
- TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
-#if CONFIG_DAALA_EC
- mbmi->tx_type = av1_ext_tx_inv[aom_read_symbol(
- r, cm->fc->intra_ext_tx_cdf[mbmi->tx_size][tx_type_nom], TX_TYPES,
- ACCT_STR)];
-#else
- mbmi->tx_type = aom_read_tree(
- r, av1_ext_tx_tree,
- cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom], ACCT_STR);
+ read_tx_type(cm, xd, mbmi,
+#if CONFIG_SUPERTX
+ 0,
#endif
- if (counts)
- ++counts->intra_ext_tx[mbmi->tx_size][tx_type_nom][mbmi->tx_type];
- } else {
- mbmi->tx_type = DCT_DCT;
- }
+ r);
}
static int read_mv_component(aom_reader *r, nmv_component *mvcomp, int usehp) {
@@ -612,13 +898,14 @@
static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref,
nmv_context *ctx, nmv_context_counts *counts,
int allow_hp) {
- const MV_JOINT_TYPE joint_type =
+ MV_JOINT_TYPE joint_type;
+ MV diff = { 0, 0 };
+ joint_type =
#if CONFIG_EC_MULTISYMBOL
(MV_JOINT_TYPE)aom_read_symbol(r, ctx->joint_cdf, MV_JOINTS, ACCT_STR);
#else
(MV_JOINT_TYPE)aom_read_tree(r, av1_mv_joint_tree, ctx->joints, ACCT_STR);
#endif
- MV diff = { 0, 0 };
if (mv_joint_vertical(joint_type))
diff.row = read_mv_component(r, &ctx->comps[0], allow_hp);
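The joint type read at the top of read_mv() only signals which of the two difference components are non-zero; the component magnitudes follow. A sketch of the encoder-side classification that produces it, assuming the MV_JOINT_* enum used by mv_joint_vertical()/mv_joint_horizontal(); classify_mv_joint is an illustrative name only.

static MV_JOINT_TYPE classify_mv_joint(const MV *diff) {
  if (diff->row == 0 && diff->col == 0) return MV_JOINT_ZERO;
  if (diff->row == 0) return MV_JOINT_HNZVZ;   /* only the column differs */
  if (diff->col == 0) return MV_JOINT_VNZHZ;   /* only the row differs    */
  return MV_JOINT_HNZVNZ;                      /* both components coded   */
}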
@@ -664,25 +951,29 @@
if (mode == COMPOUND_REFERENCE) {
#if CONFIG_EXT_REFS
const int idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
- // Read forward references.
- const int ctx_fwd = av1_get_pred_context_comp_fwdref_p(cm, xd);
- const int bit_fwd =
- aom_read(r, fc->comp_fwdref_prob[ctx_fwd][0], ACCT_STR);
- if (counts) ++counts->comp_fwdref[ctx_fwd][0][bit_fwd];
- if (!bit_fwd) {
- const int ctx_fwd1 = av1_get_pred_context_comp_fwdref_p1(cm, xd);
- const int bit_fwd1 =
- aom_read(r, fc->comp_fwdref_prob[ctx_fwd1][1], ACCT_STR);
- if (counts) ++counts->comp_fwdref[ctx_fwd1][1][bit_fwd1];
- ref_frame[!idx] = cm->comp_fwd_ref[bit_fwd1 ? 0 : 1];
+#else
+ const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+#endif // CONFIG_EXT_REFS
+ const int ctx = av1_get_pred_context_comp_ref_p(cm, xd);
+ const int bit = aom_read(r, fc->comp_ref_prob[ctx][0], ACCT_STR);
+
+ if (counts) ++counts->comp_ref[ctx][0][bit];
+
+#if CONFIG_EXT_REFS
+ // Decode forward references.
+ if (!bit) {
+ const int ctx1 = av1_get_pred_context_comp_ref_p1(cm, xd);
+ const int bit1 = aom_read(r, fc->comp_ref_prob[ctx1][1], ACCT_STR);
+ if (counts) ++counts->comp_ref[ctx1][1][bit1];
+ ref_frame[!idx] = cm->comp_fwd_ref[bit1 ? 0 : 1];
} else {
- const int ctx_fwd2 = av1_get_pred_context_comp_fwdref_p2(cm, xd);
- const int bit_fwd2 =
- aom_read(r, fc->comp_fwdref_prob[ctx_fwd2][2], ACCT_STR);
- if (counts) ++counts->comp_fwdref[ctx_fwd2][2][bit_fwd2];
- ref_frame[!idx] = cm->comp_fwd_ref[bit_fwd2 ? 3 : 2];
+ const int ctx2 = av1_get_pred_context_comp_ref_p2(cm, xd);
+ const int bit2 = aom_read(r, fc->comp_ref_prob[ctx2][2], ACCT_STR);
+ if (counts) ++counts->comp_ref[ctx2][2][bit2];
+ ref_frame[!idx] = cm->comp_fwd_ref[bit2 ? 3 : 2];
}
- // Read backward references.
+
+ // Decode backward references.
{
const int ctx_bwd = av1_get_pred_context_comp_bwdref_p(cm, xd);
const int bit_bwd =
@@ -691,18 +982,15 @@
ref_frame[idx] = cm->comp_bwd_ref[bit_bwd];
}
#else
- const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
- const int ctx = av1_get_pred_context_comp_ref_p(cm, xd);
- const int bit = aom_read(r, fc->comp_ref_prob[ctx], ACCT_STR);
- if (counts) ++counts->comp_ref[ctx][bit];
- ref_frame[idx] = cm->comp_fixed_ref;
ref_frame[!idx] = cm->comp_var_ref[bit];
+ ref_frame[idx] = cm->comp_fixed_ref;
#endif // CONFIG_EXT_REFS
} else if (mode == SINGLE_REFERENCE) {
#if CONFIG_EXT_REFS
const int ctx0 = av1_get_pred_context_single_ref_p1(xd);
const int bit0 = aom_read(r, fc->single_ref_prob[ctx0][0], ACCT_STR);
if (counts) ++counts->single_ref[ctx0][0][bit0];
+
if (bit0) {
const int ctx1 = av1_get_pred_context_single_ref_p2(xd);
const int bit1 = aom_read(r, fc->single_ref_prob[ctx1][1], ACCT_STR);
@@ -712,22 +1000,23 @@
const int ctx2 = av1_get_pred_context_single_ref_p3(xd);
const int bit2 = aom_read(r, fc->single_ref_prob[ctx2][2], ACCT_STR);
if (counts) ++counts->single_ref[ctx2][2][bit2];
- if (!bit2) {
- const int ctx3 = av1_get_pred_context_single_ref_p4(xd);
- const int bit3 = aom_read(r, fc->single_ref_prob[ctx3][3], ACCT_STR);
- if (counts) ++counts->single_ref[ctx3][3][bit3];
- ref_frame[0] = bit3 ? LAST2_FRAME : LAST_FRAME;
- } else {
+ if (bit2) {
const int ctx4 = av1_get_pred_context_single_ref_p5(xd);
const int bit4 = aom_read(r, fc->single_ref_prob[ctx4][4], ACCT_STR);
if (counts) ++counts->single_ref[ctx4][4][bit4];
ref_frame[0] = bit4 ? GOLDEN_FRAME : LAST3_FRAME;
+ } else {
+ const int ctx3 = av1_get_pred_context_single_ref_p4(xd);
+ const int bit3 = aom_read(r, fc->single_ref_prob[ctx3][3], ACCT_STR);
+ if (counts) ++counts->single_ref[ctx3][3][bit3];
+ ref_frame[0] = bit3 ? LAST2_FRAME : LAST_FRAME;
}
}
#else
const int ctx0 = av1_get_pred_context_single_ref_p1(xd);
const int bit0 = aom_read(r, fc->single_ref_prob[ctx0][0], ACCT_STR);
if (counts) ++counts->single_ref[ctx0][0][bit0];
+
if (bit0) {
const int ctx1 = av1_get_pred_context_single_ref_p2(xd);
const int bit1 = aom_read(r, fc->single_ref_prob[ctx1][1], ACCT_STR);
@@ -745,32 +1034,36 @@
}
}
-static INLINE InterpFilter read_switchable_interp_filter(AV1_COMMON *const cm,
- MACROBLOCKD *const xd,
- aom_reader *r) {
- if (cm->interp_filter == SWITCHABLE) {
+static INLINE InterpFilter read_interp_filter(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd,
+#if CONFIG_DUAL_FILTER
+ int dir,
+#endif
+ aom_reader *r) {
#if CONFIG_EXT_INTERP
- if (is_interp_needed(xd))
+ if (!av1_is_interp_needed(xd)) return EIGHTTAP_REGULAR;
#endif
- {
- const int ctx = av1_get_pred_context_switchable_interp(xd);
-#if CONFIG_DAALA_EC
- const InterpFilter type =
- (InterpFilter)av1_switchable_interp_inv[aom_read_symbol(
- r, cm->fc->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS,
- ACCT_STR)];
-#else
- const InterpFilter type = (InterpFilter)aom_read_tree(
- r, av1_switchable_interp_tree, cm->fc->switchable_interp_prob[ctx],
- ACCT_STR);
-#endif
- FRAME_COUNTS *counts = xd->counts;
- if (counts) ++counts->switchable_interp[ctx][type];
- return type;
- }
- return EIGHTTAP;
- } else {
+ if (cm->interp_filter != SWITCHABLE) {
return cm->interp_filter;
+ } else {
+#if CONFIG_DUAL_FILTER
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+#else
+ const int ctx = av1_get_pred_context_switchable_interp(xd);
+#endif
+ FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_DAALA_EC
+ const InterpFilter type =
+ (InterpFilter)av1_switchable_interp_inv[aom_read_symbol(
+ r, cm->fc->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS,
+ ACCT_STR)];
+#else
+ const InterpFilter type = (InterpFilter)aom_read_tree(
+ r, av1_switchable_interp_tree, cm->fc->switchable_interp_prob[ctx],
+ ACCT_STR);
+#endif
+ if (counts) ++counts->switchable_interp[ctx][type];
+ return type;
}
}
@@ -806,7 +1099,7 @@
mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
#if CONFIG_EXT_INTRA
- read_intra_angle_info(mbmi, r);
+ read_intra_angle_info(cm, xd, r);
#endif // CONFIG_EXT_INTRA
#if CONFIG_PALETTE
mbmi->palette_mode_info.palette_size[0] = 0;
@@ -814,6 +1107,11 @@
if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
read_palette_mode_info(cm, xd, r);
#endif // CONFIG_PALETTE
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+ if (bsize >= BLOCK_8X8) read_filter_intra_mode_info(cm, xd, r);
+#endif // CONFIG_FILTER_INTRA
}
static INLINE int is_mv_valid(const MV *mv) {
@@ -822,13 +1120,13 @@
}
static INLINE int assign_mv(AV1_COMMON *cm, MACROBLOCKD *xd,
- PREDICTION_MODE mode, int block, int_mv mv[2],
- int_mv ref_mv[2], int_mv nearest_mv[2],
- int_mv near_mv[2], int is_compound, int allow_hp,
- aom_reader *r) {
+ PREDICTION_MODE mode,
+ MV_REFERENCE_FRAME ref_frame[2], int block,
+ int_mv mv[2], int_mv ref_mv[2],
+ int_mv nearest_mv[2], int_mv near_mv[2],
+ int is_compound, int allow_hp, aom_reader *r) {
int i;
int ret = 1;
-
#if CONFIG_REF_MV
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
BLOCK_SIZE bsize = mbmi->sb_type;
@@ -837,8 +1135,12 @@
#else
(void)block;
#endif
+ (void)ref_frame;
switch (mode) {
+#if CONFIG_EXT_INTER
+ case NEWFROMNEARMV:
+#endif // CONFIG_EXT_INTER
case NEWMV: {
FRAME_COUNTS *counts = xd->counts;
#if !CONFIG_REF_MV
@@ -859,6 +1161,7 @@
allow_hp);
#endif
ret = ret && is_mv_valid(&mv[i].as_mv);
+
#if CONFIG_REF_MV
pred_mv[i].as_int = ref_mv[i].as_int;
#endif
@@ -868,6 +1171,7 @@
case NEARESTMV: {
mv[0].as_int = nearest_mv[0].as_int;
if (is_compound) mv[1].as_int = nearest_mv[1].as_int;
+
#if CONFIG_REF_MV
pred_mv[0].as_int = nearest_mv[0].as_int;
if (is_compound) pred_mv[1].as_int = nearest_mv[1].as_int;
@@ -877,6 +1181,7 @@
case NEARMV: {
mv[0].as_int = near_mv[0].as_int;
if (is_compound) mv[1].as_int = near_mv[1].as_int;
+
#if CONFIG_REF_MV
pred_mv[0].as_int = near_mv[0].as_int;
if (is_compound) pred_mv[1].as_int = near_mv[1].as_int;
@@ -884,14 +1189,162 @@
break;
}
case ZEROMV: {
+#if CONFIG_GLOBAL_MOTION
+ mv[0].as_int =
+ gm_get_motion_vector(&cm->global_motion[ref_frame[0]]).as_int;
+ if (is_compound)
+ mv[1].as_int =
+ gm_get_motion_vector(&cm->global_motion[ref_frame[1]]).as_int;
+#else
mv[0].as_int = 0;
if (is_compound) mv[1].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+
#if CONFIG_REF_MV
pred_mv[0].as_int = 0;
if (is_compound) pred_mv[1].as_int = 0;
#endif
break;
}
+#if CONFIG_EXT_INTER
+ case NEW_NEWMV: {
+ FRAME_COUNTS *counts = xd->counts;
+#if !CONFIG_REF_MV
+ nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+#endif
+ assert(is_compound);
+ for (i = 0; i < 2; ++i) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(xd->ref_mv_count[rf_type], xd->ref_mv_stack[rf_type], i,
+ mbmi->ref_mv_idx);
+ nmv_context_counts *const mv_counts =
+ counts ? &counts->mv[nmv_ctx] : NULL;
+ read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc->nmvc[nmv_ctx],
+ mv_counts, allow_hp);
+#else
+ read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc->nmvc, mv_counts,
+ allow_hp);
+#endif
+ ret = ret && is_mv_valid(&mv[i].as_mv);
+ }
+ break;
+ }
+ case NEAREST_NEARESTMV: {
+ assert(is_compound);
+ mv[0].as_int = nearest_mv[0].as_int;
+ mv[1].as_int = nearest_mv[1].as_int;
+ break;
+ }
+ case NEAREST_NEARMV: {
+ assert(is_compound);
+ mv[0].as_int = nearest_mv[0].as_int;
+ mv[1].as_int = near_mv[1].as_int;
+ break;
+ }
+ case NEAR_NEARESTMV: {
+ assert(is_compound);
+ mv[0].as_int = near_mv[0].as_int;
+ mv[1].as_int = nearest_mv[1].as_int;
+ break;
+ }
+ case NEAR_NEARMV: {
+ assert(is_compound);
+ mv[0].as_int = near_mv[0].as_int;
+ mv[1].as_int = near_mv[1].as_int;
+ break;
+ }
+ case NEW_NEARESTMV: {
+ FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type],
+ xd->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
+ nmv_context_counts *const mv_counts =
+ counts ? &counts->mv[nmv_ctx] : NULL;
+ read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, &cm->fc->nmvc[nmv_ctx],
+ mv_counts, allow_hp);
+#else
+ nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+ read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, &cm->fc->nmvc, mv_counts,
+ allow_hp);
+#endif
+ assert(is_compound);
+ ret = ret && is_mv_valid(&mv[0].as_mv);
+ mv[1].as_int = nearest_mv[1].as_int;
+ break;
+ }
+ case NEAREST_NEWMV: {
+ FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type],
+ xd->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
+ nmv_context_counts *const mv_counts =
+ counts ? &counts->mv[nmv_ctx] : NULL;
+ mv[0].as_int = nearest_mv[0].as_int;
+ read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, &cm->fc->nmvc[nmv_ctx],
+ mv_counts, allow_hp);
+#else
+ nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+ mv[0].as_int = nearest_mv[0].as_int;
+ read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, &cm->fc->nmvc, mv_counts,
+ allow_hp);
+#endif
+ assert(is_compound);
+ ret = ret && is_mv_valid(&mv[1].as_mv);
+ break;
+ }
+ case NEAR_NEWMV: {
+ FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type],
+ xd->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
+ nmv_context_counts *const mv_counts =
+ counts ? &counts->mv[nmv_ctx] : NULL;
+ mv[0].as_int = near_mv[0].as_int;
+ read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, &cm->fc->nmvc[nmv_ctx],
+ mv_counts, allow_hp);
+#else
+ nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+ mv[0].as_int = near_mv[0].as_int;
+ read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, &cm->fc->nmvc, mv_counts,
+ allow_hp);
+#endif
+ assert(is_compound);
+
+ ret = ret && is_mv_valid(&mv[1].as_mv);
+ break;
+ }
+ case NEW_NEARMV: {
+ FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(xd->ref_mv_count[rf_type],
+ xd->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
+ nmv_context_counts *const mv_counts =
+ counts ? &counts->mv[nmv_ctx] : NULL;
+ read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, &cm->fc->nmvc[nmv_ctx],
+ mv_counts, allow_hp);
+#else
+ nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+ read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, &cm->fc->nmvc, mv_counts,
+ allow_hp);
+#endif
+ assert(is_compound);
+ ret = ret && is_mv_valid(&mv[0].as_mv);
+ mv[1].as_int = near_mv[1].as_int;
+ break;
+ }
+ case ZERO_ZEROMV: {
+ assert(is_compound);
+ mv[0].as_int = 0;
+ mv[1].as_int = 0;
+ break;
+ }
+#endif // CONFIG_EXT_INTER
default: { return 0; }
}
return ret;
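The CONFIG_EXT_INTER cases added to assign_mv() above follow a simple rule: a fresh motion vector is read only for the reference list whose half of the compound mode name is NEW; the NEAREST/NEAR/ZERO halves reuse the candidates passed in. A hypothetical helper expressing that rule (not part of the patch):

static int compound_mode_reads_new_mv(PREDICTION_MODE mode, int ref_list) {
  if (ref_list == 0)  /* first reference of the compound pair */
    return mode == NEW_NEWMV || mode == NEW_NEARESTMV || mode == NEW_NEARMV;
  return mode == NEW_NEWMV || mode == NEAREST_NEWMV || mode == NEAR_NEWMV;
}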
@@ -913,22 +1366,34 @@
static void fpm_sync(void *const data, int mi_row) {
AV1Decoder *const pbi = (AV1Decoder *)data;
av1_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame,
- mi_row << MAX_MIB_SIZE_LOG2);
+ mi_row << pbi->common.mib_size_log2);
}
static void read_inter_block_mode_info(AV1Decoder *const pbi,
MACROBLOCKD *const xd,
- MODE_INFO *const mi, int mi_row,
- int mi_col, aom_reader *r) {
+ MODE_INFO *const mi,
+#if (CONFIG_MOTION_VAR || CONFIG_EXT_INTER) && CONFIG_SUPERTX
+ int mi_row, int mi_col, aom_reader *r,
+ int supertx_enabled) {
+#else
+ int mi_row, int mi_col, aom_reader *r) {
+#endif // (CONFIG_MOTION_VAR || CONFIG_EXT_INTER) && CONFIG_SUPERTX
AV1_COMMON *const cm = &pbi->common;
MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE bsize = mbmi->sb_type;
const int allow_hp = cm->allow_high_precision_mv;
int_mv nearestmv[2], nearmv[2];
int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
+#if CONFIG_EXT_INTER
+ int mv_idx;
+#endif // CONFIG_EXT_INTER
int ref, is_compound;
int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ int16_t compound_inter_mode_ctx[MODE_CTX_REF_FRAMES];
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
int16_t mode_ctx = 0;
+ MV_REFERENCE_FRAME ref_frame;
#if CONFIG_PALETTE
mbmi->palette_mode_info.palette_size[0] = 0;
@@ -939,7 +1404,7 @@
is_compound = has_second_ref(mbmi);
for (ref = 0; ref < 1 + is_compound; ++ref) {
- const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+ MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
xd->block_refs[ref] = ref_buf;
@@ -947,22 +1412,29 @@
aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
"Reference frame has invalid dimensions");
av1_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col, &ref_buf->sf);
+ }
- av1_find_mv_refs(cm, xd, mi, frame,
+ for (ref_frame = LAST_FRAME; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
+ av1_find_mv_refs(cm, xd, mi, ref_frame,
#if CONFIG_REF_MV
- &xd->ref_mv_count[frame], xd->ref_mv_stack[frame],
+ &xd->ref_mv_count[ref_frame], xd->ref_mv_stack[ref_frame],
+#if CONFIG_EXT_INTER
+ compound_inter_mode_ctx,
+#endif // CONFIG_EXT_INTER
#endif
- ref_mvs[frame], mi_row, mi_col, fpm_sync, (void *)pbi,
+ ref_mvs[ref_frame], mi_row, mi_col, fpm_sync, (void *)pbi,
inter_mode_ctx);
}
#if CONFIG_REF_MV
- if (is_compound) {
- MV_REFERENCE_FRAME ref_frame;
- ref_frame = av1_ref_frame_type(mbmi->ref_frame);
+ for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
av1_find_mv_refs(cm, xd, mi, ref_frame, &xd->ref_mv_count[ref_frame],
- xd->ref_mv_stack[ref_frame], ref_mvs[ref_frame], mi_row,
- mi_col, fpm_sync, (void *)pbi, inter_mode_ctx);
+ xd->ref_mv_stack[ref_frame],
+#if CONFIG_EXT_INTER
+ compound_inter_mode_ctx,
+#endif // CONFIG_EXT_INTER
+ ref_mvs[ref_frame], mi_row, mi_col, fpm_sync, (void *)pbi,
+ inter_mode_ctx);
if (xd->ref_mv_count[ref_frame] < 2) {
MV_REFERENCE_FRAME rf[2];
@@ -978,8 +1450,13 @@
}
}
- mode_ctx =
- av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame, bsize, -1);
+#if CONFIG_EXT_INTER
+ if (is_compound)
+ mode_ctx = compound_inter_mode_ctx[mbmi->ref_frame[0]];
+ else
+#endif // CONFIG_EXT_INTER
+ mode_ctx =
+ av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame, bsize, -1);
mbmi->ref_mv_idx = 0;
#else
mode_ctx = inter_mode_ctx[mbmi->ref_frame[0]];
@@ -989,12 +1466,21 @@
mbmi->mode = ZEROMV;
if (bsize < BLOCK_8X8) {
aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
- "Invalid usage of segment feature on small blocks");
+ "Invalid usage of segement feature on small blocks");
return;
}
} else {
if (bsize >= BLOCK_8X8) {
- mbmi->mode = read_inter_mode(cm, xd, r, mode_ctx);
+#if CONFIG_EXT_INTER
+ if (is_compound)
+ mbmi->mode = read_inter_compound_mode(cm, xd, r, mode_ctx);
+ else
+#endif // CONFIG_EXT_INTER
+ mbmi->mode = read_inter_mode(cm, xd,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ mbmi,
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ r, mode_ctx);
#if CONFIG_REF_MV
if (mbmi->mode == NEARMV || mbmi->mode == NEWMV)
read_drl_idx(cm, xd, mbmi, r);
@@ -1002,7 +1488,12 @@
}
}
+#if CONFIG_EXT_INTER
+ if (bsize < BLOCK_8X8 ||
+ (mbmi->mode != ZEROMV && mbmi->mode != ZERO_ZEROMV)) {
+#else
if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
+#endif // CONFIG_EXT_INTER
for (ref = 0; ref < 1 + is_compound; ++ref) {
av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[ref]],
&nearestmv[ref], &nearmv[ref]);
@@ -1013,41 +1504,69 @@
if (mbmi->ref_mv_idx > 0) {
int_mv cur_mv =
xd->ref_mv_stack[mbmi->ref_frame[0]][1 + mbmi->ref_mv_idx].this_mv;
- lower_mv_precision(&cur_mv.as_mv, cm->allow_high_precision_mv);
nearmv[0] = cur_mv;
}
+#if CONFIG_EXT_INTER
+ if (is_compound && bsize >= BLOCK_8X8 && mbmi->mode != ZERO_ZEROMV) {
+#else
if (is_compound && bsize >= BLOCK_8X8 && mbmi->mode != NEWMV &&
mbmi->mode != ZEROMV) {
+#endif // CONFIG_EXT_INTER
uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+#if CONFIG_EXT_INTER
+ if (xd->ref_mv_count[ref_frame_type] > 0) {
+#else
if (xd->ref_mv_count[ref_frame_type] == 1 && mbmi->mode == NEARESTMV) {
- int i;
- nearestmv[0] = xd->ref_mv_stack[ref_frame_type][0].this_mv;
- nearestmv[1] = xd->ref_mv_stack[ref_frame_type][0].comp_mv;
-
- for (i = 0; i < 2; ++i) lower_mv_precision(&nearestmv[i].as_mv, allow_hp);
+#endif // CONFIG_EXT_INTER
+#if CONFIG_EXT_INTER
+ if (mbmi->mode == NEAREST_NEARESTMV) {
+#endif // CONFIG_EXT_INTER
+ nearestmv[0] = xd->ref_mv_stack[ref_frame_type][0].this_mv;
+ nearestmv[1] = xd->ref_mv_stack[ref_frame_type][0].comp_mv;
+ lower_mv_precision(&nearestmv[0].as_mv, allow_hp);
+ lower_mv_precision(&nearestmv[1].as_mv, allow_hp);
+#if CONFIG_EXT_INTER
+ } else if (mbmi->mode == NEAREST_NEWMV || mbmi->mode == NEAREST_NEARMV) {
+ nearestmv[0] = xd->ref_mv_stack[ref_frame_type][0].this_mv;
+ lower_mv_precision(&nearestmv[0].as_mv, allow_hp);
+ } else if (mbmi->mode == NEW_NEARESTMV || mbmi->mode == NEAR_NEARESTMV) {
+ nearestmv[1] = xd->ref_mv_stack[ref_frame_type][0].comp_mv;
+ lower_mv_precision(&nearestmv[1].as_mv, allow_hp);
+ }
+#endif // CONFIG_EXT_INTER
}
+#if CONFIG_EXT_INTER
if (xd->ref_mv_count[ref_frame_type] > 1) {
- int i;
- const int ref_mv_idx = 1 + mbmi->ref_mv_idx;
+ if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEAR_NEARESTMV ||
+ mbmi->mode == NEAR_NEARMV) {
+ nearmv[0] = xd->ref_mv_stack[ref_frame_type][1].this_mv;
+ lower_mv_precision(&nearmv[0].as_mv, allow_hp);
+ }
+
+ if (mbmi->mode == NEW_NEARMV || mbmi->mode == NEAREST_NEARMV ||
+ mbmi->mode == NEAR_NEARMV) {
+ nearmv[1] = xd->ref_mv_stack[ref_frame_type][1].comp_mv;
+ lower_mv_precision(&nearmv[1].as_mv, allow_hp);
+ }
+ }
+#else
+ if (xd->ref_mv_count[ref_frame_type] > 1) {
+ int ref_mv_idx = 1 + mbmi->ref_mv_idx;
nearestmv[0] = xd->ref_mv_stack[ref_frame_type][0].this_mv;
nearestmv[1] = xd->ref_mv_stack[ref_frame_type][0].comp_mv;
nearmv[0] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
nearmv[1] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
-
- for (i = 0; i < 2; ++i) {
- lower_mv_precision(&nearestmv[i].as_mv, allow_hp);
- lower_mv_precision(&nearmv[i].as_mv, allow_hp);
- }
}
+#endif // CONFIG_EXT_INTER
}
#endif
-#if !CONFIG_EXT_INTERP
- mbmi->interp_filter = read_switchable_interp_filter(cm, xd, r);
-#endif // CONFIG_EXT_INTERP
+#if !CONFIG_EXT_INTERP && !CONFIG_DUAL_FILTER
+ mbmi->interp_filter = read_interp_filter(cm, xd, r);
+#endif // !CONFIG_EXT_INTERP && !CONFIG_DUAL_FILTER
if (bsize < BLOCK_8X8) {
const int num_4x4_w = 1 << xd->bmode_blocks_wl;
@@ -1055,30 +1574,95 @@
int idx, idy;
PREDICTION_MODE b_mode;
int_mv nearest_sub8x8[2], near_sub8x8[2];
+#if CONFIG_EXT_INTER
+ int_mv ref_mv[2][2];
+#endif // CONFIG_EXT_INTER
for (idy = 0; idy < 2; idy += num_4x4_h) {
for (idx = 0; idx < 2; idx += num_4x4_w) {
int_mv block[2];
const int j = idy * 2 + idx;
+ int_mv ref_mv_s8[2];
#if CONFIG_REF_MV
- mode_ctx = av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame,
- bsize, j);
+#if CONFIG_EXT_INTER
+ if (!is_compound)
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame,
+ bsize, j);
#endif
- b_mode = read_inter_mode(cm, xd, r, mode_ctx);
+#if CONFIG_EXT_INTER
+ if (is_compound)
+ b_mode = read_inter_compound_mode(cm, xd, r, mode_ctx);
+ else
+#endif // CONFIG_EXT_INTER
+ b_mode = read_inter_mode(cm, xd,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ mbmi,
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ r, mode_ctx);
- if (b_mode == NEARESTMV || b_mode == NEARMV) {
+#if CONFIG_EXT_INTER
+ mv_idx = (b_mode == NEWFROMNEARMV) ? 1 : 0;
+
+ if (b_mode != ZEROMV && b_mode != ZERO_ZEROMV) {
+#else
+ if (b_mode != ZEROMV) {
+#endif // CONFIG_EXT_INTER
+#if CONFIG_REF_MV
+ CANDIDATE_MV ref_mv_stack[2][MAX_REF_MV_STACK_SIZE];
+ uint8_t ref_mv_count[2];
+#endif
for (ref = 0; ref < 1 + is_compound; ++ref)
+#if CONFIG_EXT_INTER
+ {
+ int_mv mv_ref_list[MAX_MV_REF_CANDIDATES];
+ av1_update_mv_context(xd, mi, mbmi->ref_frame[ref], mv_ref_list, j,
+ mi_row, mi_col, NULL);
+#endif // CONFIG_EXT_INTER
av1_append_sub8x8_mvs_for_idx(cm, xd, j, ref, mi_row, mi_col,
+#if CONFIG_REF_MV
+ ref_mv_stack[ref], &ref_mv_count[ref],
+#endif
+#if CONFIG_EXT_INTER
+ mv_ref_list,
+#endif // CONFIG_EXT_INTER
&nearest_sub8x8[ref],
&near_sub8x8[ref]);
+#if CONFIG_EXT_INTER
+ if (have_newmv_in_inter_mode(b_mode)) {
+ mv_ref_list[0].as_int = nearest_sub8x8[ref].as_int;
+ mv_ref_list[1].as_int = near_sub8x8[ref].as_int;
+ av1_find_best_ref_mvs(allow_hp, mv_ref_list, &ref_mv[0][ref],
+ &ref_mv[1][ref]);
+ }
+ }
+#endif // CONFIG_EXT_INTER
}
- if (!assign_mv(cm, xd, b_mode, j, block, nearestmv, nearest_sub8x8,
- near_sub8x8, is_compound, allow_hp, r)) {
+ for (ref = 0; ref < 1 + is_compound && b_mode != ZEROMV; ++ref) {
+#if CONFIG_REF_MV
+ ref_mv_s8[ref] = nearest_sub8x8[ref];
+ lower_mv_precision(&ref_mv_s8[ref].as_mv, allow_hp);
+#else
+ ref_mv_s8[ref] = nearestmv[ref];
+#endif
+ }
+#if CONFIG_EXT_INTER
+ (void)ref_mv_s8;
+#endif
+
+ if (!assign_mv(cm, xd, b_mode, mbmi->ref_frame, j, block,
+#if CONFIG_EXT_INTER
+ ref_mv[mv_idx],
+#else
+ ref_mv_s8,
+#endif // CONFIG_EXT_INTER
+ nearest_sub8x8, near_sub8x8, is_compound, allow_hp, r)) {
xd->corrupted |= 1;
break;
};
mi->bmi[j].as_mv[0].as_int = block[0].as_int;
+ mi->bmi[j].as_mode = b_mode;
if (is_compound) mi->bmi[j].as_mv[1].as_int = block[1].as_int;
if (num_4x4_h == 2) mi->bmi[j + 2] = mi->bmi[j];
@@ -1086,102 +1670,269 @@
}
}
- mi->mbmi.mode = b_mode;
#if CONFIG_REF_MV
mbmi->pred_mv[0].as_int = mi->bmi[3].pred_mv[0].as_int;
mbmi->pred_mv[1].as_int = mi->bmi[3].pred_mv[1].as_int;
#endif
+ mi->mbmi.mode = b_mode;
+
mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
} else {
-#if CONFIG_REF_MV
+ int_mv ref_mv[2];
+ ref_mv[0] = nearestmv[0];
+ ref_mv[1] = nearestmv[1];
+
for (ref = 0; ref < 1 + is_compound && mbmi->mode == NEWMV; ++ref) {
- int_mv ref_mv = nearestmv[ref];
+#if CONFIG_REF_MV
uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
if (xd->ref_mv_count[ref_frame_type] > 1) {
- ref_mv =
+ ref_mv[ref] =
(ref == 0)
? xd->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx].this_mv
: xd->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx].comp_mv;
- clamp_mv_ref(&ref_mv.as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
- lower_mv_precision(&ref_mv.as_mv, allow_hp);
+ clamp_mv_ref(&ref_mv[ref].as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
}
- nearestmv[ref] = ref_mv;
- }
#endif
- xd->corrupted |= !assign_mv(cm, xd, mbmi->mode, 0, mbmi->mv, nearestmv,
- nearestmv, nearmv, is_compound, allow_hp, r);
+ nearestmv[ref] = ref_mv[ref];
+ }
+
+ xd->corrupted |=
+ !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, 0, mbmi->mv,
+#if CONFIG_EXT_INTER
+ mbmi->mode == NEWFROMNEARMV ? nearmv : nearestmv,
+#else
+ ref_mv,
+#endif // CONFIG_EXT_INTER
+ nearestmv, nearmv, is_compound, allow_hp, r);
}
-#if CONFIG_MOTION_VAR
- mbmi->motion_mode = read_motion_mode(cm, xd, mbmi, r);
-#endif // CONFIG_MOTION_VAR
+
+#if CONFIG_EXT_INTER
+ mbmi->use_wedge_interintra = 0;
+ if (cm->reference_mode != COMPOUND_REFERENCE &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif
+ is_interintra_allowed(mbmi)) {
+ const int bsize_group = size_group_lookup[bsize];
+ const int interintra =
+ aom_read(r, cm->fc->interintra_prob[bsize_group], ACCT_STR);
+ if (xd->counts) xd->counts->interintra[bsize_group][interintra]++;
+ assert(mbmi->ref_frame[1] == NONE);
+ if (interintra) {
+ const INTERINTRA_MODE interintra_mode =
+ read_interintra_mode(cm, xd, r, bsize_group);
+ mbmi->ref_frame[1] = INTRA_FRAME;
+ mbmi->interintra_mode = interintra_mode;
+#if CONFIG_EXT_INTRA
+ mbmi->angle_delta[0] = 0;
+ mbmi->angle_delta[1] = 0;
+ mbmi->intra_filter = INTRA_FILTER_LINEAR;
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
+ if (is_interintra_wedge_used(bsize)) {
+ mbmi->use_wedge_interintra =
+ aom_read(r, cm->fc->wedge_interintra_prob[bsize], ACCT_STR);
+ if (xd->counts)
+ xd->counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
+ if (mbmi->use_wedge_interintra) {
+ mbmi->interintra_wedge_index =
+ aom_read_literal(r, get_wedge_bits_lookup(bsize), ACCT_STR);
+ mbmi->interintra_wedge_sign = 0;
+ }
+ }
+ }
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+#if CONFIG_SUPERTX
+ if (!supertx_enabled)
+#endif // CONFIG_SUPERTX
+#if CONFIG_EXT_INTER
+ if (mbmi->ref_frame[1] != INTRA_FRAME)
+#endif // CONFIG_EXT_INTER
+ mbmi->motion_mode = read_motion_mode(cm, xd, mbmi, r);
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_EXT_INTER
+ mbmi->use_wedge_interinter = 0;
+ if (cm->reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode) &&
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ !(is_motion_variation_allowed(mbmi) &&
+ mbmi->motion_mode != SIMPLE_TRANSLATION) &&
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ is_interinter_wedge_used(bsize)) {
+ mbmi->use_wedge_interinter =
+ aom_read(r, cm->fc->wedge_interinter_prob[bsize], ACCT_STR);
+ if (xd->counts)
+ xd->counts->wedge_interinter[bsize][mbmi->use_wedge_interinter]++;
+ if (mbmi->use_wedge_interinter) {
+ mbmi->interinter_wedge_index =
+ aom_read_literal(r, get_wedge_bits_lookup(bsize), ACCT_STR);
+ mbmi->interinter_wedge_sign = aom_read_bit(r, ACCT_STR);
+ }
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_DUAL_FILTER
+ for (ref = 0; ref < 2; ++ref) {
+ mbmi->interp_filter[ref] = (cm->interp_filter == SWITCHABLE)
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+
+ if (has_subpel_mv_component(xd->mi[0], xd, ref) ||
+ (mbmi->ref_frame[1] > INTRA_FRAME &&
+ has_subpel_mv_component(xd->mi[0], xd, ref + 2)))
+ mbmi->interp_filter[ref] = read_interp_filter(cm, xd, ref, r);
+ }
+ // The index system works as:
+ // (0, 1) -> (vertical, horizontal) filter types for the first ref frame.
+ // (2, 3) -> (vertical, horizontal) filter types for the second ref frame.
+ mbmi->interp_filter[2] = mbmi->interp_filter[0];
+ mbmi->interp_filter[3] = mbmi->interp_filter[1];
+#else
#if CONFIG_EXT_INTERP
- mbmi->interp_filter = read_switchable_interp_filter(cm, xd, r);
+ mbmi->interp_filter = read_interp_filter(cm, xd, r);
#endif // CONFIG_EXT_INTERP
+#endif // CONFIG_DUAL_FILTER
}
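With CONFIG_DUAL_FILTER the four interp_filter entries are laid out as described in the comment above: two per reference frame, vertical before horizontal. A hypothetical accessor making that indexing explicit (get_dual_filter is an illustrative name, not part of the patch):

static InterpFilter get_dual_filter(const MB_MODE_INFO *mbmi, int ref,
                                    int is_horizontal) {
  /* ref 0/1 selects the first/second reference; entries 0 and 2 are the
   * vertical filters, 1 and 3 the horizontal ones, matching the layout
   * noted in the comment above. */
  return mbmi->interp_filter[2 * ref + is_horizontal];
}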
static void read_inter_frame_mode_info(AV1Decoder *const pbi,
- MACROBLOCKD *const xd, int mi_row,
- int mi_col, aom_reader *r) {
+ MACROBLOCKD *const xd,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif // CONFIG_SUPERTX
+ int mi_row, int mi_col, aom_reader *r) {
AV1_COMMON *const cm = &pbi->common;
MODE_INFO *const mi = xd->mi[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
- int inter_block;
+ int inter_block = 1;
+#if CONFIG_VAR_TX
+ BLOCK_SIZE bsize = mbmi->sb_type;
+#endif // CONFIG_VAR_TX
mbmi->mv[0].as_int = 0;
mbmi->mv[1].as_int = 0;
mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
- mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
+#if CONFIG_SUPERTX
+ if (!supertx_enabled) {
+#endif // CONFIG_SUPERTX
+ mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
#if CONFIG_DELTA_Q
- if (cm->delta_q_present_flag) {
- xd->current_qindex =
- xd->prev_qindex +
- read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res;
- xd->prev_qindex = xd->current_qindex;
- }
+ if (cm->delta_q_present_flag) {
+ xd->current_qindex =
+ xd->prev_qindex +
+ read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res;
+ xd->prev_qindex = xd->current_qindex;
+ }
#endif
- inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
- mbmi->tx_size = read_tx_size(cm, xd, !mbmi->skip || !inter_block, r);
+ inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
+
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT && !mbmi->skip &&
+ inter_block) {
+ const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
+ const int height = block_size_high[bsize] >> tx_size_wide_log2[0];
+ int idx, idy;
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ int is_rect_tx_allowed = inter_block && is_rect_tx_allowed_bsize(bsize) &&
+ !xd->lossless[mbmi->segment_id];
+ int use_rect_tx = 0;
+ int tx_size_cat = inter_tx_size_cat_lookup[bsize];
+ if (is_rect_tx_allowed) {
+ use_rect_tx = aom_read(r, cm->fc->rect_tx_prob[tx_size_cat], ACCT_STR);
+ if (xd->counts) {
+ ++xd->counts->rect_tx[tx_size_cat][use_rect_tx];
+ }
+ }
+
+ if (use_rect_tx) {
+ mbmi->tx_size = max_txsize_rect_lookup[bsize];
+ set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, xd);
+ } else {
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ mbmi->min_tx_size = TX_SIZES_ALL;
+ for (idy = 0; idy < height; idy += bh)
+ for (idx = 0; idx < width; idx += bw)
+ read_tx_size_vartx(cm, xd, mbmi, xd->counts, max_tx_size,
+ height != width, idy, idx, r);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ }
+#endif
+ } else {
+ if (inter_block)
+ mbmi->tx_size = read_tx_size_inter(cm, xd, !mbmi->skip, r);
+ else
+ mbmi->tx_size = read_tx_size_intra(cm, xd, r);
+
+ if (inter_block) {
+ const int width = num_4x4_blocks_wide_lookup[bsize];
+ const int height = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ for (idy = 0; idy < height; ++idy)
+ for (idx = 0; idx < width; ++idx)
+ mbmi->inter_tx_size[idy >> 1][idx >> 1] = mbmi->tx_size;
+ }
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+ set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, xd);
+ }
+#else
+ if (inter_block)
+ mbmi->tx_size = read_tx_size_inter(cm, xd, !mbmi->skip, r);
+ else
+ mbmi->tx_size = read_tx_size_intra(cm, xd, r);
+#endif // CONFIG_VAR_TX
+#if CONFIG_SUPERTX
+ }
+#if CONFIG_VAR_TX
+ else if (inter_block) {
+ const int width = num_4x4_blocks_wide_lookup[bsize];
+ const int height = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ xd->mi[0]->mbmi.tx_size = xd->supertx_size;
+ for (idy = 0; idy < height; ++idy)
+ for (idx = 0; idx < width; ++idx)
+ xd->mi[0]->mbmi.inter_tx_size[idy >> 1][idx >> 1] = xd->supertx_size;
+ }
+#endif // CONFIG_VAR_TX
+#endif // CONFIG_SUPERTX
if (inter_block)
- read_inter_block_mode_info(pbi, xd, mi, mi_row, mi_col, r);
+ read_inter_block_mode_info(pbi, xd,
+#if (CONFIG_MOTION_VAR || CONFIG_EXT_INTER) && CONFIG_SUPERTX
+ mi, mi_row, mi_col, r, supertx_enabled);
+#else
+ mi, mi_row, mi_col, r);
+#endif // (CONFIG_MOTION_VAR || CONFIG_EXT_INTER) && CONFIG_SUPERTX
else
read_intra_block_mode_info(cm, xd, mi, r);
- if (mbmi->tx_size < TX_32X32 && cm->base_qindex > 0 && !mbmi->skip &&
- !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
- FRAME_COUNTS *counts = xd->counts;
- if (inter_block) {
-#if CONFIG_DAALA_EC
- mbmi->tx_type = av1_ext_tx_inv[aom_read_symbol(
- r, cm->fc->inter_ext_tx_cdf[mbmi->tx_size], TX_TYPES, ACCT_STR)];
-#else
- mbmi->tx_type =
- aom_read_tree(r, av1_ext_tx_tree,
- cm->fc->inter_ext_tx_prob[mbmi->tx_size], ACCT_STR);
+ read_tx_type(cm, xd, mbmi,
+#if CONFIG_SUPERTX
+ supertx_enabled,
#endif
- if (counts) ++counts->inter_ext_tx[mbmi->tx_size][mbmi->tx_type];
- } else {
- const TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
-#if CONFIG_DAALA_EC
- mbmi->tx_type = av1_ext_tx_inv[aom_read_symbol(
- r, cm->fc->intra_ext_tx_cdf[mbmi->tx_size][tx_type_nom], TX_TYPES,
- ACCT_STR)];
-#else
- mbmi->tx_type = aom_read_tree(
- r, av1_ext_tx_tree,
- cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom], ACCT_STR);
-#endif
- if (counts)
- ++counts->intra_ext_tx[mbmi->tx_size][tx_type_nom][mbmi->tx_type];
- }
- } else {
- mbmi->tx_type = DCT_DCT;
- }
+ r);
}
-void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, int mi_row,
- int mi_col, aom_reader *r, int x_mis, int y_mis) {
+void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif // CONFIG_SUPERTX
+ int mi_row, int mi_col, aom_reader *r, int x_mis,
+ int y_mis) {
AV1_COMMON *const cm = &pbi->common;
MODE_INFO *const mi = xd->mi[0];
MV_REF *frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
@@ -1189,9 +1940,22 @@
if (frame_is_intra_only(cm)) {
read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
+#if CONFIG_REF_MV
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = NONE;
+ mv->ref_frame[1] = NONE;
+ }
+ }
+#endif
} else {
- read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r);
-
+ read_inter_frame_mode_info(pbi, xd,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif // CONFIG_SUPERTX
+ mi_row, mi_col, r);
for (h = 0; h < y_mis; ++h) {
MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
for (w = 0; w < x_mis; ++w) {
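
The dual-filter comment in the hunk above fixes the meaning of the four interp_filter slots. As a rough restatement of that layout (a sketch only; the helper name below is made up and is not part of this patch):

/* Sketch only -- restates the index layout described in the comment in
 * read_inter_block_mode_info(): slots 0/1 hold the vertical and horizontal
 * filters of the first reference frame, slots 2/3 those of the second. */
static INLINE int dual_filter_index_sketch(int ref, int is_horizontal) {
  return 2 * ref + is_horizontal; /* ref: 0 or 1; is_horizontal: 0 or 1 */
}

This is consistent with the encoder side further down, which probes has_subpel_mv_component() with dir and dir + 2.
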
diff --git a/av1/decoder/decodemv.h b/av1/decoder/decodemv.h
index 6a714e5..e916262 100644
--- a/av1/decoder/decodemv.h
+++ b/av1/decoder/decodemv.h
@@ -20,8 +20,13 @@
extern "C" {
#endif
-void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, int mi_row,
- int mi_col, aom_reader *r, int x_mis, int y_mis);
+void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif // CONFIG_SUPERTX
+ int mi_row, int mi_col, aom_reader *r, int x_mis,
+ int y_mis);
#ifdef __cplusplus
} // extern "C"
diff --git a/av1/decoder/decoder.c b/av1/decoder/decoder.c
index 1b12476..7547656 100644
--- a/av1/decoder/decoder.c
+++ b/av1/decoder/decoder.c
@@ -28,6 +28,7 @@
#include "av1/common/loopfilter.h"
#include "av1/common/onyxc_int.h"
#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
#include "av1/common/reconintra.h"
#include "av1/decoder/decodeframe.h"
@@ -45,6 +46,9 @@
aom_dsp_rtcd();
aom_scale_rtcd();
av1_init_intra_predictors();
+#if CONFIG_EXT_INTER
+ av1_init_wedge_masks();
+#endif // CONFIG_EXT_INTER
init_done = 1;
#if CONFIG_DAALA_EC
av1_indices_from_tree(av1_switchable_interp_ind, av1_switchable_interp_inv,
@@ -129,10 +133,14 @@
#if CONFIG_AOM_QM
aom_qm_init(cm);
#endif
+#if CONFIG_LOOP_RESTORATION
+ av1_loop_restoration_precal();
+#endif // CONFIG_LOOP_RESTORATION
#if CONFIG_ACCOUNTING
pbi->acct_enabled = 1;
aom_accounting_init(&pbi->accounting);
#endif
+
cm->error.setjmp = 0;
aom_get_worker_interface()->init(&pbi->lf_worker);
@@ -208,7 +216,12 @@
int idx;
YV12_BUFFER_CONFIG *ref_buf = NULL;
- // The set_reference control depends on the following setting in the
+ // TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
+ // encoder is using the frame buffers for. This is just a stub to keep the
+ // aomenc --test-decode functionality working, and will be replaced in a
+ // later commit that adds AV1-specific controls for this functionality.
+
+ // (Yunqing) The set_reference control depends on the following setting in
// encoder.
// cpi->lst_fb_idx = 0;
// #if CONFIG_EXT_REFS
@@ -221,6 +234,9 @@
// cpi->gld_fb_idx = 1;
// cpi->alt_fb_idx = 2;
// #endif // CONFIG_EXT_REFS
+
+ // TODO(zoeliu): Revisit the following code and reconsider what assumptions
+ // we may make about the reference frame buffer virtual indexes.
if (ref_frame_flag == AOM_LAST_FLAG) {
idx = cm->ref_frame_map[0];
#if CONFIG_EXT_REFS
@@ -278,34 +294,26 @@
// Current thread releases the holding of reference frame.
decrease_ref_count(old_idx, frame_bufs, pool);
- // Release the reference frame in reference map.
- if ((mask & 1) && old_idx >= 0) {
- decrease_ref_count(old_idx, frame_bufs, pool);
- }
+ // Release the hold on the reference frame in the reference map before
+ // decoding the next frame.
+ if (mask & 1) decrease_ref_count(old_idx, frame_bufs, pool);
cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
++ref_index;
}
-// Current thread releases the holding of reference frame.
-#if CONFIG_EXT_REFS
- for (; ref_index < REF_FRAMES; ++ref_index) {
- const int old_idx = cm->ref_frame_map[ref_index];
- decrease_ref_count(old_idx, frame_bufs, pool);
- cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
- }
-#else
+ // Current thread releases the holding of reference frame.
for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
const int old_idx = cm->ref_frame_map[ref_index];
decrease_ref_count(old_idx, frame_bufs, pool);
cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
}
-#endif // CONFIG_EXT_REFS
+
unlock_buffer_pool(pool);
pbi->hold_ref_buf = 0;
cm->frame_to_show = get_frame_new_buffer(cm);
- // TODO(zoeliu): To investigate the ref frame buffer update for the scenario
- // of cm->frame_parellel_decode == 1 in CONFIG_EXT_REFS
+ // TODO(zoeliu): To fix the ref frame buffer update for the scenario of
+ // cm->frame_parallel_decode == 1
if (!cm->frame_parallel_decode || !cm->show_frame) {
lock_buffer_pool(pool);
--frame_bufs[cm->new_fb_idx].ref_count;
@@ -313,8 +321,10 @@
}
// Invalidate these references until the next frame starts.
- for (ref_index = 0; ref_index < 3; ref_index++)
- cm->frame_refs[ref_index].idx = -1;
+ for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) {
+ cm->frame_refs[ref_index].idx = INVALID_IDX;
+ cm->frame_refs[ref_index].buf = NULL;
+ }
}
int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
@@ -343,12 +353,16 @@
pbi->ready_for_new_data = 0;
+ // Find a free buffer for the new frame, releasing the reference previously
+ // held.
+
// Check if the previous frame was a frame without any references to it.
// Release frame buffer if not decoding in frame parallel mode.
if (!cm->frame_parallel_decode && cm->new_fb_idx >= 0 &&
frame_bufs[cm->new_fb_idx].ref_count == 0)
pool->release_fb_cb(pool->cb_priv,
&frame_bufs[cm->new_fb_idx].raw_frame_buffer);
+
// Find a free frame buffer. Return error if can not find any.
cm->new_fb_idx = get_free_fb(cm);
if (cm->new_fb_idx == INVALID_IDX) return AOM_CODEC_MEM_ERROR;
@@ -393,25 +407,17 @@
// Current thread releases the holding of reference frame.
decrease_ref_count(old_idx, frame_bufs, pool);
- // Release the reference frame in reference map.
- if ((mask & 1) && old_idx >= 0) {
- decrease_ref_count(old_idx, frame_bufs, pool);
- }
+ // Release the hold on the reference frame in the reference map before
+ // decoding the next frame.
+ if (mask & 1) decrease_ref_count(old_idx, frame_bufs, pool);
++ref_index;
}
-// Current thread releases the holding of reference frame.
-#if CONFIG_EXT_REFS
- for (; ref_index < REF_FRAMES; ++ref_index) {
- const int old_idx = cm->ref_frame_map[ref_index];
- decrease_ref_count(old_idx, frame_bufs, pool);
- }
-#else
+ // Current thread releases the holding of reference frame.
for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
const int old_idx = cm->ref_frame_map[ref_index];
decrease_ref_count(old_idx, frame_bufs, pool);
}
-#endif // CONFIG_EXT_REFS
pbi->hold_ref_buf = 0;
}
// Release current frame.
@@ -427,7 +433,13 @@
swap_frame_buffers(pbi);
- aom_extend_frame_inner_borders(cm->frame_to_show);
+#if CONFIG_EXT_TILE
+ // For now, we only extend the frame borders when the whole frame is decoded.
+ // Later, if needed, extend the border for the decoded tile on the frame
+ // border.
+ if (pbi->dec_tile_row == -1 && pbi->dec_tile_col == -1)
+#endif // CONFIG_EXT_TILE
+ aom_extend_frame_inner_borders(cm->frame_to_show);
aom_clear_system_state();
@@ -435,7 +447,7 @@
cm->last_show_frame = cm->show_frame;
#if CONFIG_EXT_REFS
- // NOTE: It is not supposed to ref to any frame not used as reference.
+ // NOTE: It is not supposed to refer to any frame not used as a reference
if (cm->is_reference_frame)
#endif // CONFIG_EXT_REFS
cm->prev_frame = cm->cur_frame;
@@ -474,7 +486,6 @@
int av1_get_raw_frame(AV1Decoder *pbi, YV12_BUFFER_CONFIG *sd) {
AV1_COMMON *const cm = &pbi->common;
int ret = -1;
-
if (pbi->ready_for_new_data == 1) return ret;
pbi->ready_for_new_data = 1;
@@ -483,13 +494,21 @@
if (!cm->show_frame) return ret;
pbi->ready_for_new_data = 1;
-
*sd = *cm->frame_to_show;
ret = 0;
aom_clear_system_state();
return ret;
}
+int av1_get_frame_to_show(AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame) {
+ AV1_COMMON *const cm = &pbi->common;
+
+ if (!cm->show_frame || !cm->frame_to_show) return -1;
+
+ *frame = *cm->frame_to_show;
+ return 0;
+}
+
aom_codec_err_t av1_parse_superframe_index(const uint8_t *data, size_t data_sz,
uint32_t sizes[8], int *count,
aom_decrypt_cb decrypt_cb,
@@ -547,7 +566,7 @@
sizes[i] = this_sz;
frame_sz_sum += this_sz;
}
- sizes[i] = data_sz - index_sz - frame_sz_sum;
+ sizes[i] = (uint32_t)(data_sz - index_sz - frame_sz_sum);
*count = frames;
}
}
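
One detail worth spelling out from the superframe hunk above: the size of the final frame is not read from the index; it is whatever remains once the index itself and the explicitly coded sizes are subtracted. With illustrative numbers, data_sz = 10000 bytes, index_sz = 10 and frame_sz_sum = 7000 gives sizes[i] = 10000 - 10 - 7000 = 2990. The added cast only makes the size_t-to-uint32_t narrowing explicit; the arithmetic is unchanged.
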
diff --git a/av1/decoder/decoder.h b/av1/decoder/decoder.h
index 1b9142e..f50da1c 100644
--- a/av1/decoder/decoder.h
+++ b/av1/decoder/decoder.h
@@ -42,13 +42,13 @@
aom_reader bit_reader;
DECLARE_ALIGNED(16, MACROBLOCKD, xd);
/* dqcoeff are shared by all the planes. So planes must be decoded serially */
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
#if CONFIG_PVQ
/* forward transformed predicted image, a reference for PVQ */
DECLARE_ALIGNED(16, tran_low_t, pvq_ref_coeff[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
#endif
#if CONFIG_PALETTE
- DECLARE_ALIGNED(16, uint8_t, color_index_map[2][64 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]);
#endif // CONFIG_PALETTE
} TileData;
@@ -58,17 +58,25 @@
FRAME_COUNTS counts;
DECLARE_ALIGNED(16, MACROBLOCKD, xd);
/* dqcoeff are shared by all the planes. So planes must be decoded serially */
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
#if CONFIG_PVQ
/* forward transformed predicted image, a reference for PVQ */
DECLARE_ALIGNED(16, tran_low_t, pvq_ref_coeff[OD_TXSIZE_MAX * OD_TXSIZE_MAX]);
#endif
#if CONFIG_PALETTE
- DECLARE_ALIGNED(16, uint8_t, color_index_map[2][64 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]);
#endif // CONFIG_PALETTE
struct aom_internal_error_info error_info;
} TileWorkerData;
+typedef struct TileBufferDec {
+ const uint8_t *data;
+ size_t size;
+ const uint8_t *raw_data_end; // The end of the raw tile buffer in the
+ // bit stream.
+ int col; // only used with multi-threaded decoding
+} TileBufferDec;
+
typedef struct AV1Decoder {
DECLARE_ALIGNED(16, MACROBLOCKD, mb);
@@ -90,7 +98,9 @@
int num_tile_workers;
TileData *tile_data;
- int total_tiles;
+ int allocated_tiles;
+
+ TileBufferDec tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
AV1LfSync lf_row_sync;
@@ -101,6 +111,12 @@
int inv_tile_order;
int need_resync; // wait for key/intra-only frame.
int hold_ref_buf; // hold the reference buffer.
+
+ int tile_size_bytes;
+#if CONFIG_EXT_TILE
+ int tile_col_size_bytes;
+ int dec_tile_row, dec_tile_col;
+#endif // CONFIG_EXT_TILE
#if CONFIG_ACCOUNTING
int acct_enabled;
Accounting accounting;
@@ -119,6 +135,8 @@
int av1_get_raw_frame(struct AV1Decoder *pbi, YV12_BUFFER_CONFIG *sd);
+int av1_get_frame_to_show(struct AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame);
+
aom_codec_err_t av1_copy_reference_dec(struct AV1Decoder *pbi,
AOM_REFFRAME ref_frame_flag,
YV12_BUFFER_CONFIG *sd);
@@ -168,12 +186,12 @@
RefCntBuffer *frame_buf) {
AV1_COMMON *const cm = &pbi->common;
int i;
- for (i = 0; i < REFS_PER_FRAME; ++i) {
+ for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
RefBuffer *const ref_frame = &cm->frame_refs[i];
if (ref_frame->idx == INVALID_IDX) continue;
if (frame_buf == &cm->buffer_pool->frame_bufs[ref_frame->idx]) break;
}
- return (i < REFS_PER_FRAME);
+ return (i < INTER_REFS_PER_FRAME);
}
#endif // CONFIG_EXT_REFS
diff --git a/av1/decoder/detokenize.c b/av1/decoder/detokenize.c
index 68d87cb..0f183f2 100644
--- a/av1/decoder/detokenize.c
+++ b/av1/decoder/detokenize.c
@@ -13,14 +13,14 @@
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
#endif
-
#if !CONFIG_PVQ
+#if CONFIG_ANS
+#include "aom_dsp/ans.h"
+#endif // CONFIG_ANS
#include "av1/common/blockd.h"
#include "av1/common/common.h"
#include "av1/common/entropy.h"
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
#include "av1/common/idct.h"
-#endif
#include "av1/decoder/detokenize.h"
@@ -51,38 +51,47 @@
#if CONFIG_AOM_QM
static int decode_coefs(MACROBLOCKD *xd, PLANE_TYPE type, tran_low_t *dqcoeff,
- TX_SIZE tx_size, const int16_t *dq, int ctx,
- const int16_t *scan, const int16_t *nb, aom_reader *r,
+ TX_SIZE tx_size, TX_TYPE tx_type, const int16_t *dq,
+ int ctx, const int16_t *scan, const int16_t *nb,
+ int16_t *max_scan_line, aom_reader *r,
const qm_val_t *iqm[2][TX_SIZES])
#else
static int decode_coefs(MACROBLOCKD *xd, PLANE_TYPE type, tran_low_t *dqcoeff,
- TX_SIZE tx_size, const int16_t *dq, int ctx,
- const int16_t *scan, const int16_t *nb, aom_reader *r)
+ TX_SIZE tx_size, TX_TYPE tx_type, const int16_t *dq,
+#if CONFIG_NEW_QUANT
+ dequant_val_type_nuq *dq_val,
+#endif // CONFIG_NEW_QUANT
+ int ctx, const int16_t *scan, const int16_t *nb,
+ int16_t *max_scan_line, aom_reader *r)
#endif
{
FRAME_COUNTS *counts = xd->counts;
- const int max_eob = 1 << (tx_size_1d_log2[tx_size] * 2);
FRAME_CONTEXT *const fc = xd->fc;
+ const int max_eob = tx_size_2d[tx_size];
const int ref = is_inter_block(&xd->mi[0]->mbmi);
#if CONFIG_AOM_QM
const qm_val_t *iqmatrix = iqm[!ref][tx_size];
#endif
int band, c = 0;
+ const int tx_size_ctx = txsize_sqr_map[tx_size];
aom_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
- fc->coef_probs[tx_size][type][ref];
+ fc->coef_probs[tx_size_ctx][type][ref];
const aom_prob *prob;
#if CONFIG_EC_MULTISYMBOL
aom_cdf_prob(*coef_cdfs)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
- fc->coef_cdfs[tx_size][type][ref];
+ fc->coef_cdfs[tx_size_ctx][type][ref];
aom_cdf_prob(*cdf)[ENTROPY_TOKENS];
#endif // CONFIG_EC_MULTISYMBOL
unsigned int(*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1];
unsigned int(*eob_branch_count)[COEFF_CONTEXTS];
- uint8_t token_cache[32 * 32];
+ uint8_t token_cache[MAX_TX_SQUARE];
const uint8_t *band_translate = get_band_translate(tx_size);
- const int dq_shift = (tx_size == TX_32X32);
+ int dq_shift;
int v, token;
int16_t dqv = dq[0];
+#if CONFIG_NEW_QUANT
+ const tran_low_t *dqv_val = &dq_val[0][0];
+#endif // CONFIG_NEW_QUANT
const uint8_t *cat1_prob;
const uint8_t *cat2_prob;
const uint8_t *cat3_prob;
@@ -91,8 +100,8 @@
const uint8_t *cat6_prob;
if (counts) {
- coef_counts = counts->coef[tx_size][type][ref];
- eob_branch_count = counts->eob_branch[tx_size][type][ref];
+ coef_counts = counts->coef[tx_size_ctx][type][ref];
+ eob_branch_count = counts->eob_branch[tx_size_ctx][type][ref];
}
#if CONFIG_AOM_HIGHBITDEPTH
@@ -129,6 +138,8 @@
cat6_prob = av1_cat6_prob;
#endif
+ dq_shift = get_tx_scale(xd, tx_type, tx_size);
+
while (c < max_eob) {
int val = -1;
band = *band_translate++;
@@ -139,6 +150,10 @@
break;
}
+#if CONFIG_NEW_QUANT
+ dqv_val = &dq_val[band][0];
+#endif // CONFIG_NEW_QUANT
+
while (!aom_read(r, prob[ZERO_CONTEXT_NODE], ACCT_STR)) {
INCREMENT_COUNT(ZERO_TOKEN);
dqv = dq[1];
@@ -148,8 +163,13 @@
ctx = get_coef_context(nb, token_cache, c);
band = *band_translate++;
prob = coef_probs[band][ctx];
+#if CONFIG_NEW_QUANT
+ dqv_val = &dq_val[band][0];
+#endif // CONFIG_NEW_QUANT
}
+ *max_scan_line = AOMMAX(*max_scan_line, scan[c]);
+
#if CONFIG_EC_MULTISYMBOL
cdf = &coef_cdfs[band][ctx];
token = ONE_TOKEN +
@@ -176,7 +196,7 @@
val = CAT5_MIN_VAL + read_coeff(cat5_prob, 5, r);
break;
case CATEGORY6_TOKEN: {
- const int skip_bits = TX_SIZES - 1 - tx_size;
+ const int skip_bits = TX_SIZES - 1 - txsize_sqr_up_map[tx_size];
const uint8_t *cat6p = cat6_prob + skip_bits;
#if CONFIG_AOM_HIGHBITDEPTH
switch (xd->bd) {
@@ -194,8 +214,7 @@
#else
val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, r);
#endif
- break;
- }
+ } break;
}
#else // CONFIG_EC_MULTISYMBOL
if (!aom_read(r, prob[ONE_CONTEXT_NODE], ACCT_STR)) {
@@ -226,7 +245,7 @@
val = CAT5_MIN_VAL + read_coeff(cat5_prob, 5, r);
break;
case CATEGORY6_TOKEN: {
- const int skip_bits = TX_SIZES - 1 - tx_size;
+ const int skip_bits = TX_SIZES - 1 - txsize_sqr_up_map[tx_size];
const uint8_t *cat6p = cat6_prob + skip_bits;
#if CONFIG_AOM_HIGHBITDEPTH
switch (xd->bd) {
@@ -249,11 +268,17 @@
}
}
#endif // CONFIG_EC_MULTISYMBOL
+#if CONFIG_NEW_QUANT
+ v = av1_dequant_abscoeff_nuq(val, dqv, dqv_val);
+ v = dq_shift ? ROUND_POWER_OF_TWO(v, dq_shift) : v;
+#else
#if CONFIG_AOM_QM
dqv = ((iqmatrix[scan[c]] * (int)dqv) + (1 << (AOM_QM_BITS - 1))) >>
AOM_QM_BITS;
#endif
v = (val * dqv) >> dq_shift;
+#endif // CONFIG_NEW_QUANT
+
#if CONFIG_COEFFICIENT_RANGE_CHECKING
#if CONFIG_AOM_HIGHBITDEPTH
dqcoeff[scan[c]] =
@@ -304,21 +329,32 @@
}
#endif // CONFIG_PALETTE
-int av1_decode_block_tokens(MACROBLOCKD *xd, int plane, const SCAN_ORDER *sc,
- int x, int y, TX_SIZE tx_size, aom_reader *r,
- int seg_id) {
+int av1_decode_block_tokens(MACROBLOCKD *const xd, int plane,
+ const SCAN_ORDER *sc, int x, int y, TX_SIZE tx_size,
+ TX_TYPE tx_type, int16_t *max_scan_line,
+ aom_reader *r, int seg_id) {
struct macroblockd_plane *const pd = &xd->plane[plane];
const int16_t *const dequant = pd->seg_dequant[seg_id];
const int ctx =
get_entropy_context(tx_size, pd->above_context + x, pd->left_context + y);
+#if CONFIG_NEW_QUANT
+ const int ref = is_inter_block(&xd->mi[0]->mbmi);
+ int dq =
+ get_dq_profile_from_ctx(xd->qindex[seg_id], ctx, ref, pd->plane_type);
+#endif // CONFIG_NEW_QUANT
+
#if CONFIG_AOM_QM
- const int eob =
- decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size, dequant, ctx,
- sc->scan, sc->neighbors, r, pd->seg_iqmatrix[seg_id]);
-#else
const int eob = decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size,
- dequant, ctx, sc->scan, sc->neighbors, r);
-#endif
+ tx_type, dequant, ctx, sc->scan, sc->neighbors,
+ max_scan_line, r, pd->seg_iqmatrix[seg_id]);
+#else
+ const int eob =
+ decode_coefs(xd, pd->plane_type, pd->dqcoeff, tx_size, tx_type, dequant,
+#if CONFIG_NEW_QUANT
+ pd->seg_dequant_nuq[seg_id][dq],
+#endif // CONFIG_NEW_QUANT
+ ctx, sc->scan, sc->neighbors, max_scan_line, r);
+#endif // CONFIG_AOM_QM
av1_set_contexts(xd, pd, tx_size, eob > 0, x, y);
return eob;
}
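
For reference, on the default path (neither CONFIG_NEW_QUANT nor CONFIG_AOM_QM) the reconstruction above stays v = (val * dqv) >> dq_shift; the change is that dq_shift now comes from get_tx_scale() rather than being hard-wired to tx_size == TX_32X32. With illustrative numbers val = 3, dqv = 40 and dq_shift = 1, the dequantized magnitude is (3 * 40) >> 1 = 60; with dq_shift = 0 it would be 120.
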
diff --git a/av1/decoder/detokenize.h b/av1/decoder/detokenize.h
index 569580c..ec68665 100644
--- a/av1/decoder/detokenize.h
+++ b/av1/decoder/detokenize.h
@@ -13,8 +13,10 @@
#ifndef AV1_DECODER_DETOKENIZE_H_
#define AV1_DECODER_DETOKENIZE_H_
-#include "aom_dsp/bitreader.h"
#include "av1/decoder/decoder.h"
+#if CONFIG_ANS
+#include "aom_dsp/ans.h"
+#endif // CONFIG_ANS
#include "av1/common/scan.h"
#ifdef __cplusplus
@@ -25,8 +27,14 @@
void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, aom_reader *r);
#endif // CONFIG_PALETTE
-int av1_decode_block_tokens(MACROBLOCKD *xd, int plane, const SCAN_ORDER *sc,
- int x, int y, TX_SIZE tx_size, aom_reader *r,
+int av1_decode_block_tokens(MACROBLOCKD *const xd, int plane,
+ const SCAN_ORDER *sc, int x, int y, TX_SIZE tx_size,
+ TX_TYPE tx_type, int16_t *max_scan_line,
+#if CONFIG_ANS
+ struct AnsDecoder *const r,
+#else
+ aom_reader *r,
+#endif // CONFIG_ANS
int seg_id);
#ifdef __cplusplus
diff --git a/av1/decoder/dsubexp.c b/av1/decoder/dsubexp.c
index 5171f11..ee6a295 100644
--- a/av1/decoder/dsubexp.c
+++ b/av1/decoder/dsubexp.c
@@ -80,3 +80,15 @@
*p = (aom_prob)inv_remap_prob(delp, *p);
}
}
+
+#if CONFIG_GLOBAL_MOTION
+int aom_read_primitive_symmetric(aom_reader *r, unsigned int mag_bits) {
+ if (aom_read_bit(r, ACCT_STR_NAME)) {
+ int s = aom_read_bit(r, ACCT_STR_NAME);
+ int x = aom_read_literal(r, mag_bits, ACCT_STR_NAME) + 1;
+ return (s > 0 ? -x : x);
+ } else {
+ return 0;
+ }
+}
+#endif // CONFIG_GLOBAL_MOTION
\ No newline at end of file
diff --git a/av1/decoder/dsubexp.h b/av1/decoder/dsubexp.h
index 1ddc27a..60aa7df 100644
--- a/av1/decoder/dsubexp.h
+++ b/av1/decoder/dsubexp.h
@@ -29,5 +29,11 @@
#ifdef __cplusplus
} // extern "C"
#endif
-
+#if CONFIG_GLOBAL_MOTION
+// mag_bits is number of bits for magnitude. The alphabet is of size
+// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
+// indicate 0 or non-zero, mag_bits bits are used to indicate the magnitude
+// and 1 more bit for the sign if non-zero.
+int aom_read_primitive_symmetric(aom_reader *r, unsigned int mag_bits);
+#endif // CONFIG_GLOBAL_MOTION
#endif // AV1_DECODER_DSUBEXP_H_
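
The comment above fully determines the bit layout used by aom_read_primitive_symmetric(): one bit for zero vs. non-zero, then, for non-zero values, a sign bit followed by mag_bits bits carrying |x| - 1. A matching writer-side sketch, assuming the aom_write_bit()/aom_write_literal() calls used elsewhere in this patch (the function name is hypothetical and not part of this change):

/* Sketch only -- a hypothetical encoder-side counterpart to
 * aom_read_primitive_symmetric(), mirroring the layout documented above:
 * zero/non-zero bit, then sign bit, then |x| - 1 in mag_bits bits. */
static void write_primitive_symmetric_sketch(aom_writer *w, int x,
                                             unsigned int mag_bits) {
  if (x == 0) {
    aom_write_bit(w, 0);            /* value is zero */
  } else {
    const int mag = x < 0 ? -x : x; /* 1 <= mag <= 2^mag_bits */
    aom_write_bit(w, 1);            /* non-zero */
    aom_write_bit(w, x < 0);        /* sign: 1 for negative */
    aom_write_literal(w, mag - 1, mag_bits);
  }
}
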
diff --git a/av1/decoder/dthread.c b/av1/decoder/dthread.c
index 41bb8c7..e0ed7b0 100644
--- a/av1/decoder/dthread.c
+++ b/av1/decoder/dthread.c
@@ -182,7 +182,7 @@
(MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh));
dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level;
dst_cm->lf.filter_level = src_cm->lf.filter_level;
- memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_FRAMES);
+ memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, TOTAL_REFS_PER_FRAME);
memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
dst_cm->seg = src_cm->seg;
memcpy(dst_cm->frame_contexts, src_cm->frame_contexts,
diff --git a/av1/encoder/aq_complexity.c b/av1/encoder/aq_complexity.c
index 8a3b8b3..5c4a5e3 100644
--- a/av1/encoder/aq_complexity.c
+++ b/av1/encoder/aq_complexity.c
@@ -116,8 +116,6 @@
const AV1_COMMON *const cm = &cpi->common;
const int mi_offset = mi_row * cm->mi_cols + mi_col;
- const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
- const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
const int xmis = AOMMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[bs]);
const int ymis = AOMMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[bs]);
int x, y;
@@ -129,8 +127,8 @@
} else {
// Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh).
// It is converted to bits * 256 units.
- const int target_rate =
- (cpi->rc.sb64_target_rate * xmis * ymis * 256) / (bw * bh);
+ const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) /
+ (cm->mib_size * cm->mib_size);
double logvar;
double low_var_thresh;
const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
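
To make the rescaled target concrete (illustrative numbers only): with cm->mib_size = 8, i.e. a 64x64 superblock measured in 8x8 MI units, a superblock fully inside the frame has xmis * ymis = 64 and still gets target_rate = sb64_target_rate * 256; one clipped to half its height at the bottom edge (ymis = 4) gets half of that. The hunk only swaps the hard-coded BLOCK_64X64 denominator for cm->mib_size * cm->mib_size so the same proportionality holds for larger superblocks.
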
diff --git a/av1/encoder/aq_cyclicrefresh.c b/av1/encoder/aq_cyclicrefresh.c
index 2337abd..bcf11a7 100644
--- a/av1/encoder/aq_cyclicrefresh.c
+++ b/av1/encoder/aq_cyclicrefresh.c
@@ -64,13 +64,13 @@
cr->map = aom_calloc(mi_rows * mi_cols, sizeof(*cr->map));
if (cr->map == NULL) {
- aom_free(cr);
+ av1_cyclic_refresh_free(cr);
return NULL;
}
last_coded_q_map_size = mi_rows * mi_cols * sizeof(*cr->last_coded_q_map);
cr->last_coded_q_map = aom_malloc(last_coded_q_map_size);
if (cr->last_coded_q_map == NULL) {
- aom_free(cr);
+ av1_cyclic_refresh_free(cr);
return NULL;
}
assert(MAXQ <= 255);
@@ -320,13 +320,15 @@
double fraction_low = 0.0;
int low_content_frame = 0;
- MODE_INFO **mi = cm->mi_grid_visible;
+ MODE_INFO **mi;
RATE_CONTROL *const rc = &cpi->rc;
const int rows = cm->mi_rows, cols = cm->mi_cols;
int cnt1 = 0, cnt2 = 0;
int force_gf_refresh = 0;
for (mi_row = 0; mi_row < rows; mi_row++) {
+ mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+
for (mi_col = 0; mi_col < cols; mi_col++) {
int16_t abs_mvr = mi[0]->mbmi.mv[0].as_mv.row >= 0
? mi[0]->mbmi.mv[0].as_mv.row
@@ -345,7 +347,6 @@
// Accumulate low_content_frame.
if (cr->map[mi_row * cols + mi_col] < 1) low_content_frame++;
}
- mi += 8;
}
// For video conference clips, if the background has high motion in current
@@ -391,8 +392,8 @@
int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
int xmis, ymis, x, y;
memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
- sb_cols = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
- sb_rows = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
+ sb_cols = (cm->mi_cols + cm->mib_size - 1) / cm->mib_size;
+ sb_rows = (cm->mi_rows + cm->mib_size - 1) / cm->mib_size;
sbs_in_frame = sb_cols * sb_rows;
// Number of target blocks to get the q delta (segment 1).
block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
@@ -407,8 +408,8 @@
// Get the mi_row/mi_col corresponding to superblock index i.
int sb_row_index = (i / sb_cols);
int sb_col_index = i - sb_row_index * sb_cols;
- int mi_row = sb_row_index * MAX_MIB_SIZE;
- int mi_col = sb_col_index * MAX_MIB_SIZE;
+ int mi_row = sb_row_index * cm->mib_size;
+ int mi_col = sb_col_index * cm->mib_size;
int qindex_thresh =
cpi->oxcf.content == AOM_CONTENT_SCREEN
? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
@@ -416,11 +417,9 @@
assert(mi_row >= 0 && mi_row < cm->mi_rows);
assert(mi_col >= 0 && mi_col < cm->mi_cols);
bl_index = mi_row * cm->mi_cols + mi_col;
- // Loop through all 8x8 blocks in superblock and update map.
- xmis =
- AOMMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_64X64]);
- ymis =
- AOMMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]);
+ // Loop through all MI blocks in superblock and update map.
+ xmis = AOMMIN(cm->mi_cols - mi_col, cm->mib_size);
+ ymis = AOMMIN(cm->mi_rows - mi_row, cm->mib_size);
for (y = 0; y < ymis; y++) {
for (x = 0; x < xmis; x++) {
const int bl_index2 = bl_index + y * cm->mi_cols + x;
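
As a quick check of the superblock indexing above (illustrative numbers only): with cm->mib_size = 8 and cm->mi_cols = 100, sb_cols = (100 + 7) / 8 = 13; superblock index i = 27 then gives sb_row_index = 27 / 13 = 2, sb_col_index = 27 - 2 * 13 = 1, hence mi_row = 16 and mi_col = 8. xmis and ymis clamp the inner refresh loop to the part of that superblock that actually lies inside the frame.
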
diff --git a/av1/encoder/aq_variance.c b/av1/encoder/aq_variance.c
index 5da7193..01528ec 100644
--- a/av1/encoder/aq_variance.c
+++ b/av1/encoder/aq_variance.c
@@ -33,9 +33,10 @@
#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
-DECLARE_ALIGNED(16, static const uint8_t, av1_64_zeros[64]) = { 0 };
+DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 };
#if CONFIG_AOM_HIGHBITDEPTH
-DECLARE_ALIGNED(16, static const uint16_t, av1_highbd_64_zeros[64]) = { 0 };
+DECLARE_ALIGNED(16, static const uint16_t,
+ av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
#endif
unsigned int av1_vaq_segment_id(int energy) {
@@ -51,6 +52,8 @@
if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
cpi->refresh_alt_ref_frame ||
(cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+ cpi->vaq_refresh = 1;
+
av1_enable_segmentation(seg);
av1_clearall_segfeatures(seg);
@@ -154,16 +157,16 @@
#if CONFIG_AOM_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
- CONVERT_TO_BYTEPTR(av1_highbd_64_zeros), 0, bw, bh,
+ CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, bw, bh,
&sse, &avg);
sse >>= 2 * (xd->bd - 8);
avg >>= (xd->bd - 8);
} else {
- aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_64_zeros, 0,
+ aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0,
bw, bh, &sse, &avg);
}
#else
- aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_64_zeros, 0,
+ aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, av1_all_zeros, 0,
bw, bh, &sse, &avg);
#endif // CONFIG_AOM_HIGHBITDEPTH
var = sse - (((int64_t)avg * avg) / (bw * bh));
@@ -173,14 +176,14 @@
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
var =
cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
- CONVERT_TO_BYTEPTR(av1_highbd_64_zeros), 0, &sse);
+ CONVERT_TO_BYTEPTR(av1_highbd_all_zeros), 0, &sse);
} else {
var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
- av1_64_zeros, 0, &sse);
+ av1_all_zeros, 0, &sse);
}
#else
var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, x->plane[0].src.stride,
- av1_64_zeros, 0, &sse);
+ av1_all_zeros, 0, &sse);
#endif // CONFIG_AOM_HIGHBITDEPTH
return ((uint64_t)var * 256) >> num_pels_log2_lookup[bs];
}
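
The value returned above is the per-pixel variance scaled by 256: var * 256 shifted down by the pixel-count log2. For a 64x64 block, num_pels_log2_lookup[bs] is 12 (4096 pixels), so the result is var * 256 / 4096 = var / 16. The only change in this hunk is that the all-zero reference buffers are sized by MAX_SB_SIZE rather than a hard-coded 64.
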
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 65624f3..3b66fa7 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -10,8 +10,8 @@
*/
#include <assert.h>
-#include <stdio.h>
#include <limits.h>
+#include <stdio.h>
#include "aom/aom_encoder.h"
#include "aom_dsp/bitwriter_buffer.h"
@@ -39,8 +39,11 @@
#include "av1/common/seg_common.h"
#include "av1/common/tile_common.h"
-#include "av1/encoder/cost.h"
+#if CONFIG_ANS
+#include "aom_dsp/buf_ans.h"
+#endif // CONFIG_ANS
#include "av1/encoder/bitstream.h"
+#include "av1/encoder/cost.h"
#include "av1/encoder/encodemv.h"
#include "av1/encoder/mcomp.h"
#include "av1/encoder/segmentation.h"
@@ -52,65 +55,119 @@
static struct av1_token intra_mode_encodings[INTRA_MODES];
static struct av1_token switchable_interp_encodings[SWITCHABLE_FILTERS];
+#if CONFIG_EXT_PARTITION_TYPES
+static const struct av1_token ext_partition_encodings[EXT_PARTITION_TYPES] = {
+ { 0, 1 }, { 4, 3 }, { 12, 4 }, { 7, 3 },
+ { 10, 4 }, { 11, 4 }, { 26, 5 }, { 27, 5 }
+};
+#endif
static struct av1_token partition_encodings[PARTITION_TYPES];
#if !CONFIG_REF_MV
static struct av1_token inter_mode_encodings[INTER_MODES];
#endif
-
-#if CONFIG_PALETTE
-static const struct av1_token palette_size_encodings[] = {
- { 0, 1 }, { 2, 2 }, { 6, 3 }, { 14, 4 }, { 30, 5 }, { 62, 6 }, { 63, 6 },
-};
+#if CONFIG_EXT_INTER
static const struct av1_token
- palette_color_encodings[PALETTE_MAX_SIZE - 1][PALETTE_MAX_SIZE] = {
- { { 0, 1 }, { 1, 1 } }, // 2 colors
- { { 0, 1 }, { 2, 2 }, { 3, 2 } }, // 3 colors
- { { 0, 1 }, { 2, 2 }, { 6, 3 }, { 7, 3 } }, // 4 colors
- { { 0, 1 }, { 2, 2 }, { 6, 3 }, { 14, 4 }, { 15, 4 } }, // 5 colors
- { { 0, 1 },
- { 2, 2 },
- { 6, 3 },
- { 14, 4 },
- { 30, 5 },
- { 31, 5 } }, // 6 colors
- { { 0, 1 },
- { 2, 2 },
- { 6, 3 },
- { 14, 4 },
- { 30, 5 },
- { 62, 6 },
- { 63, 6 } }, // 7 colors
- { { 0, 1 },
- { 2, 2 },
- { 6, 3 },
- { 14, 4 },
- { 30, 5 },
- { 62, 6 },
- { 126, 7 },
- { 127, 7 } }, // 8 colors
+ inter_compound_mode_encodings[INTER_COMPOUND_MODES] = {
+ { 2, 2 }, { 50, 6 }, { 51, 6 }, { 24, 5 }, { 52, 6 },
+ { 53, 6 }, { 54, 6 }, { 55, 6 }, { 0, 1 }, { 7, 3 }
};
+#endif // CONFIG_EXT_INTER
+#if CONFIG_PALETTE
+static struct av1_token palette_size_encodings[PALETTE_MAX_SIZE - 1];
+static struct av1_token palette_color_encodings[PALETTE_MAX_SIZE - 1]
+ [PALETTE_MAX_SIZE];
#endif // CONFIG_PALETTE
+static const struct av1_token tx_size_encodings[MAX_TX_DEPTH][TX_SIZES] = {
+ { { 0, 1 }, { 1, 1 } }, // Max tx_size is 8X8
+ { { 0, 1 }, { 2, 2 }, { 3, 2 } }, // Max tx_size is 16X16
+ { { 0, 1 }, { 2, 2 }, { 6, 3 }, { 7, 3 } }, // Max tx_size is 32X32
+};
-#if CONFIG_MOTION_VAR
-static struct av1_token motion_mode_encodings[MOTION_MODES];
-#endif // CONFIG_MOTION_VAR
+#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
+static INLINE void write_uniform(aom_writer *w, int n, int v) {
+ int l = get_unsigned_bits(n);
+ int m = (1 << l) - n;
+ if (l == 0) return;
+ if (v < m) {
+ aom_write_literal(w, v, l - 1);
+ } else {
+ aom_write_literal(w, m + ((v - m) >> 1), l - 1);
+ aom_write_literal(w, (v - m) & 1, 1);
+ }
+}
+#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
+
+#if CONFIG_EXT_TX
+static struct av1_token ext_tx_inter_encodings[EXT_TX_SETS_INTER][TX_TYPES];
+static struct av1_token ext_tx_intra_encodings[EXT_TX_SETS_INTRA][TX_TYPES];
+#else
static struct av1_token ext_tx_encodings[TX_TYPES];
-
+#endif // CONFIG_EXT_TX
+#if CONFIG_GLOBAL_MOTION
+static struct av1_token global_motion_types_encodings[GLOBAL_MOTION_TYPES];
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_EXT_INTRA
+static struct av1_token intra_filter_encodings[INTRA_FILTERS];
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_EXT_INTER
+static struct av1_token interintra_mode_encodings[INTERINTRA_MODES];
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+static struct av1_token motion_mode_encodings[MOTION_MODES];
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_LOOP_RESTORATION
+static struct av1_token switchable_restore_encodings[RESTORE_SWITCHABLE_TYPES];
+#endif // CONFIG_LOOP_RESTORATION
static void write_uncompressed_header(AV1_COMP *cpi,
struct aom_write_bit_buffer *wb);
-static size_t write_compressed_header(AV1_COMP *cpi, uint8_t *data);
+static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data);
-void av1_encode_token_init() {
+void av1_encode_token_init(void) {
+#if CONFIG_EXT_TX || CONFIG_PALETTE
+ int s;
+#endif // CONFIG_EXT_TX || CONFIG_PALETTE
+#if CONFIG_EXT_TX
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+ av1_tokens_from_tree(ext_tx_inter_encodings[s], av1_ext_tx_inter_tree[s]);
+ }
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+ av1_tokens_from_tree(ext_tx_intra_encodings[s], av1_ext_tx_intra_tree[s]);
+ }
+#else
+ av1_tokens_from_tree(ext_tx_encodings, av1_ext_tx_tree);
+#endif // CONFIG_EXT_TX
av1_tokens_from_tree(intra_mode_encodings, av1_intra_mode_tree);
av1_tokens_from_tree(switchable_interp_encodings, av1_switchable_interp_tree);
av1_tokens_from_tree(partition_encodings, av1_partition_tree);
#if !CONFIG_REF_MV
av1_tokens_from_tree(inter_mode_encodings, av1_inter_mode_tree);
#endif
-#if CONFIG_MOTION_VAR
+
+#if CONFIG_PALETTE
+ av1_tokens_from_tree(palette_size_encodings, av1_palette_size_tree);
+ for (s = 0; s < PALETTE_MAX_SIZE - 1; ++s) {
+ av1_tokens_from_tree(palette_color_encodings[s], av1_palette_color_tree[s]);
+ }
+#endif // CONFIG_PALETTE
+
+#if CONFIG_EXT_INTRA
+ av1_tokens_from_tree(intra_filter_encodings, av1_intra_filter_tree);
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_EXT_INTER
+ av1_tokens_from_tree(interintra_mode_encodings, av1_interintra_mode_tree);
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
av1_tokens_from_tree(motion_mode_encodings, av1_motion_mode_tree);
-#endif // CONFIG_MOTION_VAR
- av1_tokens_from_tree(ext_tx_encodings, av1_ext_tx_tree);
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_GLOBAL_MOTION
+ av1_tokens_from_tree(global_motion_types_encodings,
+ av1_global_motion_types_tree);
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_LOOP_RESTORATION
+ av1_tokens_from_tree(switchable_restore_encodings,
+ av1_switchable_restore_tree);
+#endif // CONFIG_LOOP_RESTORATION
+
#if CONFIG_DAALA_EC
/* This hack is necessary when CONFIG_EXT_INTERP is enabled because the five
SWITCHABLE_FILTERS are not consecutive, e.g., 0, 1, 2, 3, 4, when doing
@@ -136,14 +193,35 @@
}
#endif
+#if CONFIG_EXT_INTER
+static void write_interintra_mode(aom_writer *w, INTERINTRA_MODE mode,
+ const aom_prob *probs) {
+ av1_write_token(w, av1_interintra_mode_tree, probs,
+ &interintra_mode_encodings[mode]);
+}
+#endif // CONFIG_EXT_INTER
+
static void write_inter_mode(AV1_COMMON *cm, aom_writer *w,
- PREDICTION_MODE mode, const int16_t mode_ctx) {
+ PREDICTION_MODE mode,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ int is_compound,
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ const int16_t mode_ctx) {
#if CONFIG_REF_MV
const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
const aom_prob newmv_prob = cm->fc->newmv_prob[newmv_ctx];
+#if CONFIG_EXT_INTER
+ aom_write(w, mode != NEWMV && mode != NEWFROMNEARMV, newmv_prob);
+
+ if (!is_compound && (mode == NEWMV || mode == NEWFROMNEARMV))
+ aom_write(w, mode == NEWFROMNEARMV, cm->fc->new2mv_prob);
+
+ if (mode != NEWMV && mode != NEWFROMNEARMV) {
+#else
aom_write(w, mode != NEWMV, newmv_prob);
if (mode != NEWMV) {
+#endif // CONFIG_EXT_INTER
const int16_t zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
const aom_prob zeromv_prob = cm->fc->zeromv_prob[zeromv_ctx];
@@ -151,6 +229,7 @@
assert(mode == ZEROMV);
return;
}
+
aom_write(w, mode != ZEROMV, zeromv_prob);
if (mode != ZEROMV) {
@@ -220,22 +299,24 @@
}
#endif
-#if CONFIG_MOTION_VAR
-static void write_motion_mode(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
- aom_writer *w) {
- if (is_motion_variation_allowed(mbmi))
- av1_write_token(w, av1_motion_mode_tree,
- cm->fc->motion_mode_prob[mbmi->sb_type],
- &motion_mode_encodings[mbmi->motion_mode]);
+#if CONFIG_EXT_INTER
+static void write_inter_compound_mode(AV1_COMMON *cm, aom_writer *w,
+ PREDICTION_MODE mode,
+ const int16_t mode_ctx) {
+ const aom_prob *const inter_compound_probs =
+ cm->fc->inter_compound_mode_probs[mode_ctx];
+
+ assert(is_inter_compound_mode(mode));
+ av1_write_token(w, av1_inter_compound_mode_tree, inter_compound_probs,
+ &inter_compound_mode_encodings[INTER_COMPOUND_OFFSET(mode)]);
}
-#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_EXT_INTER
static void encode_unsigned_max(struct aom_write_bit_buffer *wb, int data,
int max) {
aom_wb_write_literal(wb, data, get_unsigned_bits(max));
}
-#if !CONFIG_EC_ADAPT
static void prob_diff_update(const aom_tree_index *tree,
aom_prob probs[/*n - 1*/],
const unsigned int counts[/*n - 1*/], int n,
@@ -251,6 +332,7 @@
av1_cond_prob_diff_update(w, &probs[i], branch_ct[i], probwt);
}
+#if !CONFIG_EC_ADAPT
static int prob_diff_update_savings(const aom_tree_index *tree,
aom_prob probs[/*n - 1*/],
const unsigned int counts[/*n - 1*/], int n,
@@ -270,18 +352,87 @@
}
#endif
+#if CONFIG_VAR_TX
+static void write_tx_size_vartx(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi, TX_SIZE tx_size,
+ int depth, int blk_row, int blk_col,
+ aom_writer *w) {
+ const int tx_row = blk_row >> 1;
+ const int tx_col = blk_col >> 1;
+ const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+ const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
+
+ int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row,
+ mbmi->sb_type, tx_size);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (depth == MAX_VARTX_DEPTH) {
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, tx_size);
+ return;
+ }
+
+ if (tx_size == mbmi->inter_tx_size[tx_row][tx_col]) {
+ aom_write(w, 0, cm->fc->txfm_partition_prob[ctx]);
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, tx_size);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs];
+ int i;
+
+ aom_write(w, 1, cm->fc->txfm_partition_prob[ctx]);
+
+ if (tx_size == TX_8X8) {
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, TX_4X4);
+ return;
+ }
+
+ assert(bsl > 0);
+ for (i = 0; i < 4; ++i) {
+ int offsetr = blk_row + (i >> 1) * bsl;
+ int offsetc = blk_col + (i & 0x01) * bsl;
+ write_tx_size_vartx(cm, xd, mbmi, sub_txs, depth + 1, offsetr, offsetc,
+ w);
+ }
+ }
+}
+
+static void update_txfm_partition_probs(AV1_COMMON *cm, aom_writer *w,
+ FRAME_COUNTS *counts, int probwt) {
+ int k;
+ for (k = 0; k < TXFM_PARTITION_CONTEXTS; ++k)
+ av1_cond_prob_diff_update(w, &cm->fc->txfm_partition_prob[k],
+ counts->txfm_partition[k], probwt);
+}
+#endif
+
static void write_selected_tx_size(const AV1_COMMON *cm, const MACROBLOCKD *xd,
aom_writer *w) {
- TX_SIZE tx_size = xd->mi[0]->mbmi.tx_size;
- BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
- const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
- const aom_prob *const tx_probs =
- get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
- aom_write(w, tx_size != TX_4X4, tx_probs[TX_4X4]);
- if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
- aom_write(w, tx_size != TX_8X8, tx_probs[TX_8X8]);
- if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
- aom_write(w, tx_size != TX_16X16, tx_probs[TX_16X16]);
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ // For sub8x8 blocks the tx_size symbol does not need to be sent
+ if (bsize >= BLOCK_8X8) {
+ const TX_SIZE tx_size = mbmi->tx_size;
+ const int is_inter = is_inter_block(mbmi);
+ const int tx_size_ctx = get_tx_size_context(xd);
+ const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
+ : intra_tx_size_cat_lookup[bsize];
+ const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
+ const int depth = tx_size_to_depth(coded_tx_size);
+
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi)));
+ assert(
+ IMPLIES(is_rect_tx(tx_size), tx_size == max_txsize_rect_lookup[bsize]));
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ av1_write_token(w, av1_tx_size_tree[tx_size_cat],
+ cm->fc->tx_size_probs[tx_size_cat][tx_size_ctx],
+ &tx_size_encodings[tx_size_cat][depth]);
}
}
@@ -306,9 +457,38 @@
for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
av1_cond_prob_diff_update(w, &cm->fc->drl_prob[i], counts->drl_mode[i],
probwt);
+#if CONFIG_EXT_INTER
+ av1_cond_prob_diff_update(w, &cm->fc->new2mv_prob, counts->new2mv_mode,
+ probwt);
+#endif // CONFIG_EXT_INTER
}
#endif
+#if CONFIG_EXT_INTER
+static void update_inter_compound_mode_probs(AV1_COMMON *cm, int probwt,
+ aom_writer *w) {
+ const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
+ av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
+ int i;
+ int savings = 0;
+ int do_update = 0;
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
+ savings += prob_diff_update_savings(
+ av1_inter_compound_mode_tree, cm->fc->inter_compound_mode_probs[i],
+ cm->counts.inter_compound_mode[i], INTER_COMPOUND_MODES, probwt);
+ }
+ do_update = savings > savings_thresh;
+ aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+ if (do_update) {
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
+ prob_diff_update(
+ av1_inter_compound_mode_tree, cm->fc->inter_compound_mode_probs[i],
+ cm->counts.inter_compound_mode[i], INTER_COMPOUND_MODES, probwt, w);
+ }
+ }
+}
+#endif // CONFIG_EXT_INTER
+
static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd,
int segment_id, const MODE_INFO *mi, aom_writer *w) {
if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
@@ -391,6 +571,64 @@
}
#endif
+#if CONFIG_EXT_TX
+static void update_ext_tx_probs(AV1_COMMON *cm, aom_writer *w) {
+ const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
+ av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
+ int i, j;
+ int s;
+#if CONFIG_TILE_GROUPS
+ const int probwt = cm->num_tg;
+#else
+ const int probwt = 1;
+#endif
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+ int savings = 0;
+ int do_update = 0;
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ if (!use_inter_ext_tx_for_txsize[s][i]) continue;
+ savings += prob_diff_update_savings(
+ av1_ext_tx_inter_tree[s], cm->fc->inter_ext_tx_prob[s][i],
+ cm->counts.inter_ext_tx[s][i], num_ext_tx_set_inter[s], probwt);
+ }
+ do_update = savings > savings_thresh;
+ aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+ if (do_update) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ if (!use_inter_ext_tx_for_txsize[s][i]) continue;
+ prob_diff_update(
+ av1_ext_tx_inter_tree[s], cm->fc->inter_ext_tx_prob[s][i],
+ cm->counts.inter_ext_tx[s][i], num_ext_tx_set_inter[s], probwt, w);
+ }
+ }
+ }
+
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+ int savings = 0;
+ int do_update = 0;
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ if (!use_intra_ext_tx_for_txsize[s][i]) continue;
+ for (j = 0; j < INTRA_MODES; ++j)
+ savings += prob_diff_update_savings(
+ av1_ext_tx_intra_tree[s], cm->fc->intra_ext_tx_prob[s][i][j],
+ cm->counts.intra_ext_tx[s][i][j], num_ext_tx_set_intra[s], probwt);
+ }
+ do_update = savings > savings_thresh;
+ aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+ if (do_update) {
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ if (!use_intra_ext_tx_for_txsize[s][i]) continue;
+ for (j = 0; j < INTRA_MODES; ++j)
+ prob_diff_update(av1_ext_tx_intra_tree[s],
+ cm->fc->intra_ext_tx_prob[s][i][j],
+ cm->counts.intra_ext_tx[s][i][j],
+ num_ext_tx_set_intra[s], probwt, w);
+ }
+ }
+ }
+}
+
+#else
#if !CONFIG_EC_ADAPT
static void update_ext_tx_probs(AV1_COMMON *cm, aom_writer *w) {
const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
@@ -436,13 +674,13 @@
}
}
}
+#endif // CONFIG_EXT_TX
#endif
-
#if CONFIG_PALETTE
-static void pack_palette_tokens(aom_writer *w, TOKENEXTRA **tp, int n,
+static void pack_palette_tokens(aom_writer *w, const TOKENEXTRA **tp, int n,
int num) {
int i;
- TOKENEXTRA *p = *tp;
+ const TOKENEXTRA *p = *tp;
for (i = 0; i < num; ++i) {
av1_write_token(w, av1_palette_color_tree[n - 2], p->context_tree,
@@ -453,12 +691,42 @@
*tp = p;
}
#endif // CONFIG_PALETTE
-
#if !CONFIG_PVQ
-static void pack_mb_tokens(aom_writer *w, TOKENEXTRA **tp,
+#if CONFIG_SUPERTX
+static void update_supertx_probs(AV1_COMMON *cm, int probwt, aom_writer *w) {
+ const int savings_thresh = av1_cost_one(GROUP_DIFF_UPDATE_PROB) -
+ av1_cost_zero(GROUP_DIFF_UPDATE_PROB);
+ int i, j;
+ int savings = 0;
+ int do_update = 0;
+ for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
+ for (j = 1; j < TX_SIZES; ++j) {
+ savings += av1_cond_prob_diff_update_savings(
+ &cm->fc->supertx_prob[i][j], cm->counts.supertx[i][j], probwt);
+ }
+ }
+ do_update = savings > savings_thresh;
+ aom_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+ if (do_update) {
+ for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
+ for (j = 1; j < TX_SIZES; ++j) {
+ av1_cond_prob_diff_update(w, &cm->fc->supertx_prob[i][j],
+ cm->counts.supertx[i][j], probwt);
+ }
+ }
+ }
+}
+#endif // CONFIG_SUPERTX
+
+static void pack_mb_tokens(aom_writer *w, const TOKENEXTRA **tp,
const TOKENEXTRA *const stop,
- aom_bit_depth_t bit_depth, const TX_SIZE tx) {
- TOKENEXTRA *p = *tp;
+ aom_bit_depth_t bit_depth, const TX_SIZE tx_size,
+ TOKEN_STATS *token_stats) {
+ const TOKENEXTRA *p = *tp;
+#if CONFIG_VAR_TX
+ int count = 0;
+ const int seg_eob = tx_size_2d[tx_size];
+#endif
#if CONFIG_AOM_HIGHBITDEPTH
const av1_extra_bit *const extra_bits_table =
(bit_depth == AOM_BITS_12)
@@ -470,7 +738,7 @@
#endif // CONFIG_AOM_HIGHBITDEPTH
while (p < stop && p->token != EOSB_TOKEN) {
- const uint8_t token = p->token;
+ const int token = p->token;
aom_tree_index index = 0;
#if !CONFIG_EC_MULTISYMBOL
const struct av1_token *const coef_encoding = &av1_coef_encodings[token];
@@ -480,10 +748,13 @@
const av1_extra_bit *const extra_bits = &extra_bits_table[token];
#if CONFIG_EC_MULTISYMBOL
- if (!p->skip_eob_node) aom_write(w, token != EOB_TOKEN, p->context_tree[0]);
+ /* skip one or two nodes */
+ if (!p->skip_eob_node)
+ aom_write_record(w, token != EOB_TOKEN, p->context_tree[0], token_stats);
if (token != EOB_TOKEN) {
- aom_write(w, token != ZERO_TOKEN, p->context_tree[1]);
+ aom_write_record(w, token != ZERO_TOKEN, p->context_tree[1], token_stats);
+
if (token != ZERO_TOKEN) {
aom_write_symbol(w, token - ONE_TOKEN, *p->token_cdf,
CATEGORY6_TOKEN - ONE_TOKEN + 1);
@@ -491,34 +762,26 @@
}
#else
/* skip one or two nodes */
- if (p->skip_eob_node) {
+ if (p->skip_eob_node)
coef_length -= p->skip_eob_node;
- index = 2 * p->skip_eob_node;
- }
+ else
+ aom_write_record(w, token != EOB_TOKEN, p->context_tree[0], token_stats);
- // TODO(jbb): expanding this can lead to big gains. It allows
- // much better branch prediction and would enable us to avoid numerous
- // lookups and compares.
+ if (token != EOB_TOKEN) {
+ aom_write_record(w, token != ZERO_TOKEN, p->context_tree[1], token_stats);
- // If we have a token that's in the constrained set, the coefficient tree
- // is split into two treed writes. The first treed write takes care of the
- // unconstrained nodes. The second treed write takes care of the
- // constrained nodes.
- if (token >= TWO_TOKEN && token < EOB_TOKEN) {
- const int unconstrained_len = UNCONSTRAINED_NODES - p->skip_eob_node;
- const int unconstrained_bits =
- coef_value >> (coef_length - unconstrained_len);
- // Unconstrained nodes.
- aom_write_tree_bits(w, av1_coef_tree, p->context_tree, unconstrained_bits,
- unconstrained_len, index);
- coef_value &= (1 << (coef_length - unconstrained_len)) - 1;
- // Constrained nodes.
- aom_write_tree(w, av1_coef_con_tree,
- av1_pareto8_full[p->context_tree[PIVOT_NODE] - 1],
- coef_value, coef_length - unconstrained_len, 0);
- } else {
- aom_write_tree_bits(w, av1_coef_tree, p->context_tree, coef_value,
- coef_length, index);
+ if (token != ZERO_TOKEN) {
+ aom_write_record(w, token != ONE_TOKEN, p->context_tree[2],
+ token_stats);
+
+ if (token != ONE_TOKEN) {
+ const int unconstrained_len = UNCONSTRAINED_NODES - p->skip_eob_node;
+ aom_write_tree_record(
+ w, av1_coef_con_tree,
+ av1_pareto8_full[p->context_tree[PIVOT_NODE] - 1], coef_value,
+ coef_length - unconstrained_len, 0, token_stats);
+ }
+ }
}
#endif // CONFIG_EC_MULTISYMBOL
@@ -527,8 +790,9 @@
const int bit_string_length = extra_bits->len; // Length of extra bits to
// be written excluding
// the sign bit.
- int skip_bits =
- (extra_bits->base_val == CAT6_MIN_VAL) ? TX_SIZES - 1 - tx : 0;
+ int skip_bits = (extra_bits->base_val == CAT6_MIN_VAL)
+ ? TX_SIZES - 1 - txsize_sqr_up_map[tx_size]
+ : 0;
if (bit_string_length > 0) {
const unsigned char *pb = extra_bits->prob;
@@ -543,18 +807,67 @@
--skip_bits;
assert(!bb);
} else {
- aom_write(w, bb, pb[index]);
+ aom_write_record(w, bb, pb[index], token_stats);
}
}
}
- aom_write_bit(w, bit_string & 1);
+ aom_write_bit_record(w, bit_string & 1, token_stats);
}
++p;
+
+#if CONFIG_VAR_TX
+ ++count;
+ if (token == EOB_TOKEN || count == seg_eob) break;
+#endif
}
*tp = p;
}
+#endif // !CONFIG_PVQ
+#if CONFIG_VAR_TX
+static void pack_txb_tokens(aom_writer *w, const TOKENEXTRA **tp,
+ const TOKENEXTRA *const tok_end, MACROBLOCKD *xd,
+ MB_MODE_INFO *mbmi, int plane,
+ BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth,
+ int block, int blk_row, int blk_col,
+ TX_SIZE tx_size, TOKEN_STATS *token_stats) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ const int tx_row = blk_row >> (1 - pd->subsampling_y);
+ const int tx_col = blk_col >> (1 - pd->subsampling_x);
+ TX_SIZE plane_tx_size;
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ plane_tx_size =
+ plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
+ : mbmi->inter_tx_size[tx_row][tx_col];
+
+ if (tx_size == plane_tx_size) {
+ pack_mb_tokens(w, tp, tok_end, bit_depth, tx_size, token_stats);
+ } else {
+ const int bsl = block_size_wide[bsize] >> (tx_size_wide_log2[0] + 1);
+ int i;
+
+ assert(bsl > 0);
+
+ for (i = 0; i < 4; ++i) {
+ const int offsetr = blk_row + (i >> 1) * bsl;
+ const int offsetc = blk_col + (i & 0x01) * bsl;
+ const TX_SIZE sub_txs = tx_size - 1;
+ const int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ pack_txb_tokens(w, tp, tok_end, xd, mbmi, plane, plane_bsize, bit_depth,
+ block, offsetr, offsetc, sub_txs, token_stats);
+ block += step;
+ }
+ }
+}
#endif
static void write_segment_id(aom_writer *w, const struct segmentation *seg,
@@ -592,30 +905,31 @@
if (is_compound) {
#if CONFIG_EXT_REFS
- const int bit_fwd = (mbmi->ref_frame[0] == GOLDEN_FRAME ||
- mbmi->ref_frame[0] == LAST3_FRAME);
+ const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME ||
+ mbmi->ref_frame[0] == LAST3_FRAME);
const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME;
+#else // CONFIG_EXT_REFS
+ const int bit = mbmi->ref_frame[0] == GOLDEN_FRAME;
+#endif // CONFIG_EXT_REFS
- // Write forward references.
- aom_write(w, bit_fwd, av1_get_pred_prob_comp_fwdref_p(cm, xd));
- if (!bit_fwd) {
- const int bit1_fwd = mbmi->ref_frame[0] == LAST_FRAME;
- aom_write(w, bit1_fwd, av1_get_pred_prob_comp_fwdref_p1(cm, xd));
+ aom_write(w, bit, av1_get_pred_prob_comp_ref_p(cm, xd));
+
+#if CONFIG_EXT_REFS
+ if (!bit) {
+ const int bit1 = mbmi->ref_frame[0] == LAST_FRAME;
+ aom_write(w, bit1, av1_get_pred_prob_comp_ref_p1(cm, xd));
} else {
- const int bit2_fwd = mbmi->ref_frame[0] == GOLDEN_FRAME;
- aom_write(w, bit2_fwd, av1_get_pred_prob_comp_fwdref_p2(cm, xd));
+ const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME;
+ aom_write(w, bit2, av1_get_pred_prob_comp_ref_p2(cm, xd));
}
- // Write forward references.
aom_write(w, bit_bwd, av1_get_pred_prob_comp_bwdref_p(cm, xd));
-#else
- aom_write(w, mbmi->ref_frame[0] == GOLDEN_FRAME,
- av1_get_pred_prob_comp_ref_p(cm, xd));
#endif // CONFIG_EXT_REFS
} else {
#if CONFIG_EXT_REFS
const int bit0 = (mbmi->ref_frame[0] == ALTREF_FRAME ||
mbmi->ref_frame[0] == BWDREF_FRAME);
aom_write(w, bit0, av1_get_pred_prob_single_ref_p1(cm, xd));
+
if (bit0) {
const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME;
aom_write(w, bit1, av1_get_pred_prob_single_ref_p2(cm, xd));
@@ -623,6 +937,7 @@
const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME ||
mbmi->ref_frame[0] == GOLDEN_FRAME);
aom_write(w, bit2, av1_get_pred_prob_single_ref_p3(cm, xd));
+
if (!bit2) {
const int bit3 = mbmi->ref_frame[0] != LAST_FRAME;
aom_write(w, bit3, av1_get_pred_prob_single_ref_p4(cm, xd));
@@ -631,9 +946,10 @@
aom_write(w, bit4, av1_get_pred_prob_single_ref_p5(cm, xd));
}
}
-#else
+#else // CONFIG_EXT_REFS
const int bit0 = mbmi->ref_frame[0] != LAST_FRAME;
aom_write(w, bit0, av1_get_pred_prob_single_ref_p1(cm, xd));
+
if (bit0) {
const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
aom_write(w, bit1, av1_get_pred_prob_single_ref_p2(cm, xd));
@@ -643,63 +959,120 @@
}
}
-#if CONFIG_EXT_INTRA || CONFIG_PALETTE
-static INLINE void write_uniform(aom_writer *w, int n, int v) {
- const int l = get_unsigned_bits(n);
- const int m = (1 << l) - n;
+#if CONFIG_FILTER_INTRA
+static void write_filter_intra_mode_info(const AV1_COMMON *const cm,
+ const MB_MODE_INFO *const mbmi,
+ aom_writer *w) {
+ if (mbmi->mode == DC_PRED
+#if CONFIG_PALETTE
+ && mbmi->palette_mode_info.palette_size[0] == 0
+#endif // CONFIG_PALETTE
+ ) {
+ aom_write(w, mbmi->filter_intra_mode_info.use_filter_intra_mode[0],
+ cm->fc->filter_intra_probs[0]);
+ if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) {
+ const FILTER_INTRA_MODE mode =
+ mbmi->filter_intra_mode_info.filter_intra_mode[0];
+ write_uniform(w, FILTER_INTRA_MODES, mode);
+ }
+ }
- if (l == 0) return;
- if (v < m) {
- aom_write_literal(w, v, l - 1);
- } else {
- aom_write_literal(w, m + ((v - m) >> 1), l - 1);
- aom_write_literal(w, (v - m) & 1, 1);
+ if (mbmi->uv_mode == DC_PRED
+#if CONFIG_PALETTE
+ && mbmi->palette_mode_info.palette_size[1] == 0
+#endif // CONFIG_PALETTE
+ ) {
+ aom_write(w, mbmi->filter_intra_mode_info.use_filter_intra_mode[1],
+ cm->fc->filter_intra_probs[1]);
+ if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1]) {
+ const FILTER_INTRA_MODE mode =
+ mbmi->filter_intra_mode_info.filter_intra_mode[1];
+ write_uniform(w, FILTER_INTRA_MODES, mode);
+ }
}
}
-#endif // CONFIG_EXT_INTRA || CONFIG_PALETTE
+#endif // CONFIG_FILTER_INTRA
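
Editor's note: write_uniform(), removed at this spot but still called by the filter-intra and angle-delta writers above (so presumably defined elsewhere in the file), is the usual quasi-uniform code: with l = ceil(log2 n) and m = (1 << l) - n, the first m symbols cost l - 1 bits and the rest cost l. A self-contained sketch of the per-symbol cost follows; it re-derives the bit count locally instead of using get_unsigned_bits(), and the n = 10 value stands in for FILTER_INTRA_MODES purely for illustration.

/* Editor's sketch: bit cost of the quasi-uniform code used by
 * write_uniform().  Not part of the patch. */
#include <assert.h>
#include <stdio.h>

static int bits_for(int n) {          /* smallest l with (1 << l) >= n  */
  int l = 0;
  while ((1 << l) < n) ++l;
  return l;
}

static int quasi_uniform_len(int n, int v) {
  const int l = bits_for(n);
  const int m = (1 << l) - n;         /* number of short codewords      */
  if (l == 0) return 0;               /* n == 1: nothing to send        */
  return (v < m) ? l - 1 : l;         /* short vs long codeword         */
}

int main(void) {
  /* With n = 10: l = 4, m = 6, so symbols 0..5 cost 3 bits, 6..9 cost 4. */
  for (int v = 0; v < 10; ++v)
    printf("v=%d -> %d bits\n", v, quasi_uniform_len(10, v));
  assert(quasi_uniform_len(10, 0) == 3 && quasi_uniform_len(10, 9) == 4);
  return 0;
}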
#if CONFIG_EXT_INTRA
-static void write_intra_angle_info(const MB_MODE_INFO *const mbmi,
+static void write_intra_angle_info(const AV1_COMMON *cm, const MACROBLOCKD *xd,
aom_writer *w) {
- if (mbmi->sb_type < BLOCK_8X8) return;
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
+ int p_angle;
- if (is_directional_mode(mbmi->mode)) {
- const TX_SIZE max_tx_size = max_txsize_lookup[mbmi->sb_type];
- const int max_angle_delta = av1_max_angle_delta_y[max_tx_size][mbmi->mode];
- write_uniform(w, 2 * max_angle_delta + 1,
- max_angle_delta + mbmi->intra_angle_delta[0]);
+ if (bsize < BLOCK_8X8) return;
+
+ if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED) {
+ write_uniform(w, 2 * MAX_ANGLE_DELTAS + 1,
+ MAX_ANGLE_DELTAS + mbmi->angle_delta[0]);
+ p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle)) {
+ av1_write_token(w, av1_intra_filter_tree,
+ cm->fc->intra_filter_probs[intra_filter_ctx],
+ &intra_filter_encodings[mbmi->intra_filter]);
+ }
}
- if (is_directional_mode(mbmi->uv_mode)) {
- write_uniform(w, 2 * MAX_ANGLE_DELTA_UV + 1,
- MAX_ANGLE_DELTA_UV + mbmi->intra_angle_delta[1]);
+ if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED) {
+ write_uniform(w, 2 * MAX_ANGLE_DELTAS + 1,
+ MAX_ANGLE_DELTAS + mbmi->angle_delta[1]);
}
}
#endif // CONFIG_EXT_INTRA
-static void write_switchable_interp_filter(AV1_COMP *const cpi,
- const MACROBLOCKD *const xd,
+static void write_switchable_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd,
aom_writer *w) {
- const AV1_COMMON *const cm = &cpi->common;
+ AV1_COMMON *const cm = &cpi->common;
const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_DUAL_FILTER
+ int dir;
+#endif
if (cm->interp_filter == SWITCHABLE) {
- int ctx;
#if CONFIG_EXT_INTERP
- if (!is_interp_needed(xd)) {
- assert(mbmi->interp_filter == EIGHTTAP);
+#if CONFIG_DUAL_FILTER
+ if (!av1_is_interp_needed(xd)) {
+ assert(mbmi->interp_filter[0] == EIGHTTAP_REGULAR);
return;
}
-#endif
- ctx = av1_get_pred_context_switchable_interp(xd);
-#if CONFIG_DAALA_EC
- aom_write_symbol(w, av1_switchable_interp_ind[mbmi->interp_filter],
- cm->fc->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS);
#else
- av1_write_token(w, av1_switchable_interp_tree,
- cm->fc->switchable_interp_prob[ctx],
- &switchable_interp_encodings[mbmi->interp_filter]);
+ if (!av1_is_interp_needed(xd)) {
+#if CONFIG_DUAL_FILTER
+ assert(mbmi->interp_filter[0] == EIGHTTAP_REGULAR);
+ assert(mbmi->interp_filter[1] == EIGHTTAP_REGULAR);
+#else
+ assert(mbmi->interp_filter == EIGHTTAP_REGULAR);
#endif
- ++cpi->interp_filter_selected[0][mbmi->interp_filter];
+ return;
+ }
+#endif // CONFIG_DUAL_FILTER
+#endif // CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+ for (dir = 0; dir < 2; ++dir) {
+ if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
+ (mbmi->ref_frame[1] > INTRA_FRAME &&
+ has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ av1_write_token(w, av1_switchable_interp_tree,
+ cm->fc->switchable_interp_prob[ctx],
+ &switchable_interp_encodings[mbmi->interp_filter[dir]]);
+ ++cpi->interp_filter_selected[0][mbmi->interp_filter[dir]];
+ }
+ }
+#else
+ {
+ const int ctx = av1_get_pred_context_switchable_interp(xd);
+#if CONFIG_DAALA_EC
+ aom_write_symbol(w, av1_switchable_interp_ind[mbmi->interp_filter],
+ cm->fc->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS);
+#else
+ av1_write_token(w, av1_switchable_interp_tree,
+ cm->fc->switchable_interp_prob[ctx],
+ &switchable_interp_encodings[mbmi->interp_filter]);
+#endif
+ ++cpi->interp_filter_selected[0][mbmi->interp_filter];
+ }
+#endif
}
}
@@ -713,6 +1086,7 @@
const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
int palette_ctx = 0;
int n, i;
+
if (mbmi->mode == DC_PRED) {
n = pmi->palette_size[0];
if (above_mi)
@@ -730,6 +1104,7 @@
write_uniform(w, n, pmi->palette_first_color_idx[0]);
}
}
+
if (mbmi->uv_mode == DC_PRED) {
n = pmi->palette_size[1];
aom_write(w, n > 0,
@@ -750,7 +1125,81 @@
}
#endif // CONFIG_PALETTE
+static void write_tx_type(const AV1_COMMON *const cm,
+ const MB_MODE_INFO *const mbmi,
+#if CONFIG_SUPERTX
+ const int supertx_enabled,
+#endif
+ aom_writer *w) {
+ const int is_inter = is_inter_block(mbmi);
+#if CONFIG_VAR_TX
+ const TX_SIZE tx_size = is_inter ? mbmi->min_tx_size : mbmi->tx_size;
+#else
+ const TX_SIZE tx_size = mbmi->tx_size;
+#endif
+ if (!FIXED_TX_TYPE) {
+#if CONFIG_EXT_TX
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ if (get_ext_tx_types(tx_size, bsize, is_inter) > 1 && cm->base_qindex > 0 &&
+ !mbmi->skip &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif // CONFIG_SUPERTX
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ int eset = get_ext_tx_set(tx_size, bsize, is_inter);
+ if (is_inter) {
+ assert(ext_tx_used_inter[eset][mbmi->tx_type]);
+ if (eset > 0)
+ av1_write_token(
+ w, av1_ext_tx_inter_tree[eset],
+ cm->fc->inter_ext_tx_prob[eset][txsize_sqr_map[tx_size]],
+ &ext_tx_inter_encodings[eset][mbmi->tx_type]);
+ } else if (ALLOW_INTRA_EXT_TX) {
+ if (eset > 0)
+ av1_write_token(w, av1_ext_tx_intra_tree[eset],
+ cm->fc->intra_ext_tx_prob[eset][tx_size][mbmi->mode],
+ &ext_tx_intra_encodings[eset][mbmi->tx_type]);
+ }
+ }
+#else
+ if (tx_size < TX_32X32 && cm->base_qindex > 0 && !mbmi->skip &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif // CONFIG_SUPERTX
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ if (is_inter) {
+#if CONFIG_DAALA_EC
+ aom_write_symbol(w, av1_ext_tx_ind[mbmi->tx_type],
+ cm->fc->inter_ext_tx_cdf[tx_size], TX_TYPES);
+#else
+ av1_write_token(w, av1_ext_tx_tree, cm->fc->inter_ext_tx_prob[tx_size],
+ &ext_tx_encodings[mbmi->tx_type]);
+#endif
+ } else {
+#if CONFIG_DAALA_EC
+ aom_write_symbol(
+ w, av1_ext_tx_ind[mbmi->tx_type],
+ cm->fc->intra_ext_tx_cdf[tx_size]
+ [intra_mode_to_tx_type_context[mbmi->mode]],
+ TX_TYPES);
+#else
+ av1_write_token(
+ w, av1_ext_tx_tree,
+ cm->fc
+ ->intra_ext_tx_prob[tx_size]
+ [intra_mode_to_tx_type_context[mbmi->mode]],
+ &ext_tx_encodings[mbmi->tx_type]);
+#endif
+ }
+ }
+#endif // CONFIG_EXT_TX
+ }
+}
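
Editor's note: write_tx_type() only emits a symbol when the transform type can actually vary. Below is a compact restatement of that gate for the EXT_TX flavour (the SUPERTX argument is ignored), written as a standalone sketch rather than library code.

/* Editor's sketch of the gating logic in write_tx_type().  The field
 * names mirror the patch, but this is an illustration only. */
#include <stdbool.h>
#include <stdio.h>

struct tx_gate_inputs {
  bool fixed_tx_type;    /* FIXED_TX_TYPE build-time switch               */
  int num_ext_tx_types;  /* result of get_ext_tx_types(tx_size, bsize, .) */
  int base_qindex;       /* cm->base_qindex                               */
  bool skip;             /* mbmi->skip                                    */
  bool seg_skip;         /* SEG_LVL_SKIP active for this segment          */
};

/* True when write_tx_type() would emit a symbol at all. */
static bool tx_type_is_coded(const struct tx_gate_inputs *in) {
  if (in->fixed_tx_type) return false;         /* nothing to choose       */
  if (in->num_ext_tx_types <= 1) return false; /* only one legal type     */
  if (in->base_qindex == 0) return false;      /* lossless path uses DCT  */
  if (in->skip || in->seg_skip) return false;  /* no coded coefficients   */
  return true;
}

int main(void) {
  const struct tx_gate_inputs in = { false, 4, 32, false, false };
  printf("tx_type coded: %d\n", tx_type_is_coded(&in)); /* prints 1 */
  return 0;
}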
+
static void pack_inter_mode_mvs(AV1_COMP *cpi, const MODE_INFO *mi,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif
aom_writer *w) {
AV1_COMMON *const cm = &cpi->common;
#if !CONFIG_REF_MV
@@ -761,8 +1210,8 @@
MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
#else
- const MACROBLOCK *const x = &cpi->td.mb;
- const MACROBLOCKD *const xd = &x->e_mbd;
+ const MACROBLOCK *x = &cpi->td.mb;
+ const MACROBLOCKD *xd = &x->e_mbd;
#endif
const struct segmentation *const seg = &cm->seg;
struct segmentation_probs *const segp = &cm->fc->seg;
@@ -787,8 +1236,14 @@
}
}
+#if CONFIG_SUPERTX
+ if (supertx_enabled)
+ skip = mbmi->skip;
+ else
+ skip = write_skip(cm, xd, segment_id, mi, w);
+#else
skip = write_skip(cm, xd, segment_id, mi, w);
-
+#endif // CONFIG_SUPERTX
#if CONFIG_DELTA_Q
if (cm->delta_q_present_flag) {
int mi_row = (-xd->mb_to_top_edge) >> (MI_SIZE_LOG2 + 3);
@@ -804,12 +1259,54 @@
}
#endif
- if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
- aom_write(w, is_inter, av1_get_intra_inter_prob(cm, xd));
+#if CONFIG_SUPERTX
+ if (!supertx_enabled)
+#endif // CONFIG_SUPERTX
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+ aom_write(w, is_inter, av1_get_intra_inter_prob(cm, xd));
if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif // CONFIG_SUPERTX
!(is_inter && skip) && !xd->lossless[segment_id]) {
+#if CONFIG_VAR_TX
+ if (is_inter) { // This implies skip flag is 0.
+ const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ const int width = num_4x4_blocks_wide_lookup[bsize];
+ const int height = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (is_rect_tx_allowed(xd, mbmi)) {
+ int tx_size_cat = inter_tx_size_cat_lookup[bsize];
+
+ aom_write(w, is_rect_tx(mbmi->tx_size),
+ cm->fc->rect_tx_prob[tx_size_cat]);
+ }
+
+ if (is_rect_tx(mbmi->tx_size)) {
+ set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, xd);
+ } else {
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ for (idy = 0; idy < height; idy += bh)
+ for (idx = 0; idx < width; idx += bw)
+ write_tx_size_vartx(cm, xd, mbmi, max_tx_size, height != width, idy,
+ idx, w);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ }
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ } else {
+ set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, xd);
+ write_selected_tx_size(cm, xd, w);
+ }
+ } else {
+ set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, xd);
+#else
write_selected_tx_size(cm, xd, w);
+#endif
}
if (!is_inter) {
@@ -844,26 +1341,43 @@
write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mode]);
#endif
#if CONFIG_EXT_INTRA
- write_intra_angle_info(mbmi, w);
+ write_intra_angle_info(cm, xd, w);
#endif // CONFIG_EXT_INTRA
-
#if CONFIG_PALETTE
if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
write_palette_mode_info(cm, xd, mi, w);
#endif // CONFIG_PALETTE
+#if CONFIG_FILTER_INTRA
+ if (bsize >= BLOCK_8X8) write_filter_intra_mode_info(cm, mbmi, w);
+#endif // CONFIG_FILTER_INTRA
} else {
int16_t mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
write_ref_frames(cm, xd, w);
#if CONFIG_REF_MV
- mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
- mbmi->ref_frame, bsize, -1);
+#if CONFIG_EXT_INTER
+ if (is_compound)
+ mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+ else
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, bsize, -1);
#endif
// If segment skip is not enabled code the mode.
if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
if (bsize >= BLOCK_8X8) {
- write_inter_mode(cm, w, mode, mode_ctx);
+#if CONFIG_EXT_INTER
+ if (is_inter_compound_mode(mode))
+ write_inter_compound_mode(cm, w, mode, mode_ctx);
+ else if (is_inter_singleref_mode(mode))
+#endif // CONFIG_EXT_INTER
+ write_inter_mode(cm, w, mode,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ is_compound,
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ mode_ctx);
+
#if CONFIG_REF_MV
if (mode == NEARMV || mode == NEWMV)
write_drl_idx(cm, mbmi, mbmi_ext, w);
@@ -871,9 +1385,9 @@
}
}
-#if !CONFIG_EXT_INTERP
+#if !CONFIG_EXT_INTERP && !CONFIG_DUAL_FILTER
write_switchable_interp_filter(cpi, xd, w);
-#endif // CONFIG_EXT_INTERP
+#endif  // !CONFIG_EXT_INTERP && !CONFIG_DUAL_FILTER
if (bsize < BLOCK_8X8) {
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
@@ -884,11 +1398,29 @@
const int j = idy * 2 + idx;
const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
#if CONFIG_REF_MV
- mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
- mbmi->ref_frame, bsize, j);
+#if CONFIG_EXT_INTER
+ if (!is_compound)
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, bsize, j);
#endif
- write_inter_mode(cm, w, b_mode, mode_ctx);
+#if CONFIG_EXT_INTER
+ if (is_inter_compound_mode(b_mode))
+ write_inter_compound_mode(cm, w, b_mode, mode_ctx);
+ else if (is_inter_singleref_mode(b_mode))
+#endif // CONFIG_EXT_INTER
+ write_inter_mode(cm, w, b_mode,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ has_second_ref(mbmi),
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ mode_ctx);
+
+#if CONFIG_EXT_INTER
+ if (b_mode == NEWMV || b_mode == NEWFROMNEARMV ||
+ b_mode == NEW_NEWMV) {
+#else
if (b_mode == NEWMV) {
+#endif // CONFIG_EXT_INTER
for (ref = 0; ref < 1 + is_compound; ++ref) {
#if CONFIG_REF_MV
int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
@@ -898,14 +1430,60 @@
nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
#endif
av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
+#if CONFIG_EXT_INTER
+ &mi->bmi[j].ref_mv[ref].as_mv,
+#if CONFIG_REF_MV
+ is_compound,
+#endif
+#else
+#if CONFIG_REF_MV
+ &mi->bmi[j].pred_mv[ref].as_mv, is_compound,
+#else
&mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0].as_mv,
+#endif // CONFIG_REF_MV
+#endif // CONFIG_EXT_INTER
nmvc, allow_hp);
}
}
+#if CONFIG_EXT_INTER
+ else if (b_mode == NEAREST_NEWMV || b_mode == NEAR_NEWMV) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 1,
+ mbmi->ref_mv_idx);
+ nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+#endif
+ av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[1].as_mv,
+ &mi->bmi[j].ref_mv[1].as_mv,
+#if CONFIG_REF_MV
+ is_compound,
+#endif
+ nmvc, allow_hp);
+ } else if (b_mode == NEW_NEARESTMV || b_mode == NEW_NEARMV) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx = av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 0,
+ mbmi->ref_mv_idx);
+ nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+#endif
+ av1_encode_mv(cpi, w, &mi->bmi[j].as_mv[0].as_mv,
+ &mi->bmi[j].ref_mv[0].as_mv,
+#if CONFIG_REF_MV
+ is_compound,
+#endif
+ nmvc, allow_hp);
+ }
+#endif // CONFIG_EXT_INTER
}
}
} else {
+#if CONFIG_EXT_INTER
+ if (mode == NEWMV || mode == NEWFROMNEARMV || mode == NEW_NEWMV) {
+#else
if (mode == NEWMV) {
+#endif // CONFIG_EXT_INTER
int_mv ref_mv;
for (ref = 0; ref < 1 + is_compound; ++ref) {
#if CONFIG_REF_MV
@@ -916,48 +1494,126 @@
nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
#endif
ref_mv = mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0];
- av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc,
- allow_hp);
+#if CONFIG_EXT_INTER
+ if (mode == NEWFROMNEARMV)
+ av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv,
+ &mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][1].as_mv,
+#if CONFIG_REF_MV
+ is_compound,
+#endif
+ nmvc, allow_hp);
+ else
+#endif // CONFIG_EXT_INTER
+ av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv,
+#if CONFIG_REF_MV
+ is_compound,
+#endif
+ nmvc, allow_hp);
+ }
+#if CONFIG_EXT_INTER
+ } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
+ nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+#endif
+ av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv,
+ &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv,
+#if CONFIG_REF_MV
+ is_compound,
+#endif
+ nmvc, allow_hp);
+ } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
+ nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+#endif
+ av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv,
+ &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv,
+#if CONFIG_REF_MV
+ is_compound,
+#endif
+ nmvc, allow_hp);
+#endif // CONFIG_EXT_INTER
+ }
+ }
+
+#if CONFIG_EXT_INTER
+ if (cpi->common.reference_mode != COMPOUND_REFERENCE &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif // CONFIG_SUPERTX
+ is_interintra_allowed(mbmi)) {
+ const int interintra = mbmi->ref_frame[1] == INTRA_FRAME;
+ const int bsize_group = size_group_lookup[bsize];
+ aom_write(w, interintra, cm->fc->interintra_prob[bsize_group]);
+ if (interintra) {
+ write_interintra_mode(w, mbmi->interintra_mode,
+ cm->fc->interintra_mode_prob[bsize_group]);
+ if (is_interintra_wedge_used(bsize)) {
+ aom_write(w, mbmi->use_wedge_interintra,
+ cm->fc->wedge_interintra_prob[bsize]);
+ if (mbmi->use_wedge_interintra) {
+ aom_write_literal(w, mbmi->interintra_wedge_index,
+ get_wedge_bits_lookup(bsize));
+ assert(mbmi->interintra_wedge_sign == 0);
+ }
}
}
}
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_SUPERTX
+ if (!supertx_enabled)
+#endif // CONFIG_SUPERTX
+#if CONFIG_EXT_INTER
+ if (mbmi->ref_frame[1] != INTRA_FRAME)
+#endif // CONFIG_EXT_INTER
+ if (is_motion_variation_allowed(mbmi)) {
+ // TODO(debargha): Might want to only emit this if SEG_LVL_SKIP
+ // is not active, and assume SIMPLE_TRANSLATION in the decoder if
+ // it is active.
+ assert(mbmi->motion_mode < MOTION_MODES);
+ av1_write_token(w, av1_motion_mode_tree,
+ cm->fc->motion_mode_prob[bsize],
+ &motion_mode_encodings[mbmi->motion_mode]);
+ }
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_EXT_INTER
+ if (cpi->common.reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode) &&
#if CONFIG_MOTION_VAR
- write_motion_mode(cm, mbmi, w);
+ !(is_motion_variation_allowed(mbmi) &&
+ mbmi->motion_mode != SIMPLE_TRANSLATION) &&
#endif // CONFIG_MOTION_VAR
-#if CONFIG_EXT_INTERP
+ is_interinter_wedge_used(bsize)) {
+ aom_write(w, mbmi->use_wedge_interinter,
+ cm->fc->wedge_interinter_prob[bsize]);
+ if (mbmi->use_wedge_interinter) {
+ aom_write_literal(w, mbmi->interinter_wedge_index,
+ get_wedge_bits_lookup(bsize));
+ aom_write_bit(w, mbmi->interinter_wedge_sign);
+ }
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_EXT_INTERP || CONFIG_DUAL_FILTER
write_switchable_interp_filter(cpi, xd, w);
#endif // CONFIG_EXT_INTERP
}
- if (mbmi->tx_size < TX_32X32 && cm->base_qindex > 0 && !mbmi->skip &&
- !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
- if (is_inter) {
-#if CONFIG_DAALA_EC
- aom_write_symbol(w, av1_ext_tx_ind[mbmi->tx_type],
- cm->fc->inter_ext_tx_cdf[mbmi->tx_size], TX_TYPES);
-#else
- av1_write_token(w, av1_ext_tx_tree,
- cm->fc->inter_ext_tx_prob[mbmi->tx_size],
- &ext_tx_encodings[mbmi->tx_type]);
+ write_tx_type(cm, mbmi,
+#if CONFIG_SUPERTX
+ supertx_enabled,
#endif
- } else {
-#if CONFIG_DAALA_EC
- aom_write_symbol(
- w, av1_ext_tx_ind[mbmi->tx_type],
- cm->fc->intra_ext_tx_cdf[mbmi->tx_size]
- [intra_mode_to_tx_type_context[mbmi->mode]],
- TX_TYPES);
-#else
- av1_write_token(
- w, av1_ext_tx_tree,
- cm->fc->intra_ext_tx_prob[mbmi->tx_size]
- [intra_mode_to_tx_type_context[mbmi->mode]],
- &ext_tx_encodings[mbmi->tx_type]);
-#endif
- }
- } else {
- if (!mbmi->skip) assert(mbmi->tx_type == DCT_DCT);
- }
+ w);
}
#if CONFIG_DELTA_Q
@@ -1033,31 +1689,46 @@
write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mbmi->mode]);
#endif
#if CONFIG_EXT_INTRA
- write_intra_angle_info(mbmi, w);
+ write_intra_angle_info(cm, xd, w);
#endif // CONFIG_EXT_INTRA
-
#if CONFIG_PALETTE
if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
write_palette_mode_info(cm, xd, mi, w);
#endif // CONFIG_PALETTE
+#if CONFIG_FILTER_INTRA
+ if (bsize >= BLOCK_8X8) write_filter_intra_mode_info(cm, mbmi, w);
+#endif // CONFIG_FILTER_INTRA
- if (mbmi->tx_size < TX_32X32 && cm->base_qindex > 0 && !mbmi->skip &&
- !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-#if CONFIG_DAALA_EC
- aom_write_symbol(
- w, av1_ext_tx_ind[mbmi->tx_type],
- cm->fc->intra_ext_tx_cdf[mbmi->tx_size]
- [intra_mode_to_tx_type_context[mbmi->mode]],
- TX_TYPES);
-#else
- av1_write_token(
- w, av1_ext_tx_tree,
- cm->fc->intra_ext_tx_prob[mbmi->tx_size]
- [intra_mode_to_tx_type_context[mbmi->mode]],
- &ext_tx_encodings[mbmi->tx_type]);
+ write_tx_type(cm, mbmi,
+#if CONFIG_SUPERTX
+ 0,
#endif
+ w);
+}
+
+#if CONFIG_SUPERTX
+#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \
+ mi_row, mi_col) \
+ write_modes_b(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col)
+#else
+#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \
+ mi_row, mi_col) \
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col)
+#endif // CONFIG_SUPERTX
+
+#if CONFIG_RD_DEBUG
+static void dump_mode_info(MODE_INFO *mi) {
+ printf("\nmi->mbmi.mi_row == %d\n", mi->mbmi.mi_row);
+ printf("&& mi->mbmi.mi_col == %d\n", mi->mbmi.mi_col);
+ printf("&& mi->mbmi.sb_type == %d\n", mi->mbmi.sb_type);
+ printf("&& mi->mbmi.tx_size == %d\n", mi->mbmi.tx_size);
+ if (mi->mbmi.sb_type >= BLOCK_8X8) {
+ printf("&& mi->mbmi.mode == %d\n", mi->mbmi.mode);
+ } else {
+ printf("&& mi->bmi[0].as_mode == %d\n", mi->bmi[0].as_mode);
}
}
+#endif
#if CONFIG_PVQ
PVQ_INFO *get_pvq_block(PVQ_QUEUE *pvq_q) {
@@ -1074,29 +1745,43 @@
#endif
static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
- aom_writer *w, TOKENEXTRA **tok,
- const TOKENEXTRA *const tok_end, int mi_row,
- int mi_col) {
+ aom_writer *w, const TOKENEXTRA **tok,
+ const TOKENEXTRA *const tok_end,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif
+ int mi_row, int mi_col) {
AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
MODE_INFO *m;
int plane;
+ int bh, bw;
+#if CONFIG_RD_DEBUG
+ int64_t txb_coeff_cost[MAX_MB_PLANE] = { 0 };
+#endif
+#if CONFIG_RANS
+ (void)tok;
+ (void)tok_end;
+ (void)plane;
+#endif  // CONFIG_RANS
#if CONFIG_PVQ
MB_MODE_INFO *mbmi;
BLOCK_SIZE bsize;
od_adapt_ctx *adapt;
-
(void)tok;
(void)tok_end;
#endif
xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
m = xd->mi[0];
+ assert(m->mbmi.sb_type <= cm->sb_size);
+
+ bh = num_8x8_blocks_high_lookup[m->mbmi.sb_type];
+ bw = num_8x8_blocks_wide_lookup[m->mbmi.sb_type];
+
cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
- set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
- mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type],
- cm->mi_rows, cm->mi_cols);
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
#if CONFIG_PVQ
mbmi = &m->mbmi;
bsize = mbmi->sb_type;
@@ -1106,7 +1791,46 @@
if (frame_is_intra_only(cm)) {
write_mb_modes_kf(cm, xd, xd->mi, w);
} else {
- pack_inter_mode_mvs(cpi, m, w);
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+#endif
+#if CONFIG_EXT_INTERP
+ // av1_is_interp_needed needs the ref frame buffers set up to look
+ // up if they are scaled. av1_is_interp_needed is in turn needed by
+ // write_switchable_interp_filter, which is called by pack_inter_mode_mvs.
+ set_ref_ptrs(cm, xd, m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]);
+#endif // CONFIG_EXT_INTERP
+#if 0
+ // NOTE(zoeliu): For debug
+ if (cm->current_video_frame == FRAME_TO_CHECK && cm->show_frame == 1) {
+ const PREDICTION_MODE mode = m->mbmi.mode;
+ const int segment_id = m->mbmi.segment_id;
+ const BLOCK_SIZE bsize = m->mbmi.sb_type;
+
+ // For sub8x8, simply dump out the first sub8x8 block info
+ const PREDICTION_MODE b_mode =
+ (bsize < BLOCK_8X8) ? m->bmi[0].as_mode : -1;
+ const int mv_x = (bsize < BLOCK_8X8) ?
+ m->bmi[0].as_mv[0].as_mv.row : m->mbmi.mv[0].as_mv.row;
+ const int mv_y = (bsize < BLOCK_8X8) ?
+ m->bmi[0].as_mv[0].as_mv.col : m->mbmi.mv[0].as_mv.col;
+
+ printf("Before pack_inter_mode_mvs(): "
+ "Frame=%d, (mi_row,mi_col)=(%d,%d), "
+ "mode=%d, segment_id=%d, bsize=%d, b_mode=%d, "
+ "mv[0]=(%d, %d), ref[0]=%d, ref[1]=%d\n",
+ cm->current_video_frame, mi_row, mi_col,
+ mode, segment_id, bsize, b_mode, mv_x, mv_y,
+ m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]);
+ }
+#endif // 0
+ pack_inter_mode_mvs(cpi, m,
+#if CONFIG_SUPERTX
+ supertx_enabled,
+#endif
+ w);
}
#if CONFIG_PALETTE
@@ -1119,22 +1843,96 @@
assert(*tok < tok_end);
pack_palette_tokens(w, tok, m->mbmi.palette_mode_info.palette_size[plane],
rows * cols - 1);
- assert(*tok < tok_end);
+ assert(*tok < tok_end + m->mbmi.skip);
}
}
#endif // CONFIG_PALETTE
-
#if !CONFIG_PVQ
+#if CONFIG_SUPERTX
+ if (supertx_enabled) return;
+#endif // CONFIG_SUPERTX
+
if (!m->mbmi.skip) {
assert(*tok < tok_end);
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_VAR_TX
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ MB_MODE_INFO *mbmi = &m->mbmi;
+ BLOCK_SIZE bsize = mbmi->sb_type;
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(AOMMAX(bsize, BLOCK_8X8), pd);
+
+ const int num_4x4_w =
+ block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int num_4x4_h =
+ block_size_high[plane_bsize] >> tx_size_wide_log2[0];
+ int row, col;
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ TX_SIZE tx_size =
+ plane ? get_uv_tx_size(mbmi, &xd->plane[plane]) : mbmi->tx_size;
+#endif
+
+ TOKEN_STATS token_stats;
+ token_stats.cost = 0;
+
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ if (is_inter_block(mbmi) && !is_rect_tx(tx_size))
+#else
+ if (is_inter_block(mbmi))
+#endif
+ {
+ const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+ int block = 0;
+ const int step =
+ tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ const int bkw = tx_size_wide_unit[max_tx_size];
+ const int bkh = tx_size_high_unit[max_tx_size];
+ for (row = 0; row < num_4x4_h; row += bkh) {
+ for (col = 0; col < num_4x4_w; col += bkw) {
+ pack_txb_tokens(w, tok, tok_end, xd, mbmi, plane, plane_bsize,
+ cm->bit_depth, block, row, col, max_tx_size,
+ &token_stats);
+ block += step;
+ }
+ }
+ } else {
+ TX_SIZE tx = plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane])
+ : m->mbmi.tx_size;
+ const int bkw = tx_size_wide_unit[tx];
+ const int bkh = tx_size_high_unit[tx];
+
+ for (row = 0; row < num_4x4_h; row += bkh)
+ for (col = 0; col < num_4x4_w; col += bkw)
+ pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
+ }
+#else
TX_SIZE tx =
plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane]) : m->mbmi.tx_size;
- pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
+ TOKEN_STATS token_stats;
+ token_stats.cost = 0;
+ pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
+#endif // CONFIG_VAR_TX
+
+#if CONFIG_RD_DEBUG
+ txb_coeff_cost[plane] += token_stats.cost;
+#else
+ (void)token_stats;
+#endif
+
assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
(*tok)++;
}
}
+
+#if CONFIG_RD_DEBUG
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ if (m->mbmi.rd_stats.txb_coeff_cost[plane] != txb_coeff_cost[plane]) {
+ dump_mode_info(m);
+ assert(0);
+ }
+ }
+#endif // CONFIG_RD_DEBUG
#else
// PVQ writes its tokens (i.e. symbols) here.
if (!m->mbmi.skip) {
@@ -1245,11 +2043,19 @@
const int has_cols = (mi_col + hbs) < cm->mi_cols;
if (has_rows && has_cols) {
+#if CONFIG_EXT_PARTITION_TYPES
+ if (bsize <= BLOCK_8X8)
+ av1_write_token(w, av1_partition_tree, probs, &partition_encodings[p]);
+ else
+ av1_write_token(w, av1_ext_partition_tree, probs,
+ &ext_partition_encodings[p]);
+#else
#if CONFIG_DAALA_EC
aom_write_symbol(w, p, cm->fc->partition_cdf[ctx], PARTITION_TYPES);
#else
av1_write_token(w, av1_partition_tree, probs, &partition_encodings[p]);
#endif
+#endif // CONFIG_EXT_PARTITION_TYPES
} else if (!has_rows && has_cols) {
assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
aom_write(w, p == PARTITION_SPLIT, probs[1]);
@@ -1261,60 +2067,187 @@
}
}
-static void write_modes_sb(AV1_COMP *cpi, const TileInfo *const tile,
- aom_writer *w, TOKENEXTRA **tok,
- const TOKENEXTRA *const tok_end, int mi_row,
- int mi_col, BLOCK_SIZE bsize) {
+#if CONFIG_SUPERTX
+#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \
+ mi_row, mi_col, bsize) \
+ write_modes_sb(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col, \
+ bsize)
+#else
+#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, \
+ mi_row, mi_col, bsize) \
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, bsize)
+#endif // CONFIG_SUPERTX
+
+static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
+ aom_writer *const w, const TOKENEXTRA **tok,
+ const TOKENEXTRA *const tok_end,
+#if CONFIG_SUPERTX
+ int supertx_enabled,
+#endif
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
-
- const int bsl = b_width_log2_lookup[bsize];
- const int bs = (1 << bsl) / 4;
- PARTITION_TYPE partition;
- BLOCK_SIZE subsize;
- const MODE_INFO *m = NULL;
+ const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+ const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_SUPERTX
+ const int mi_offset = mi_row * cm->mi_stride + mi_col;
+ MB_MODE_INFO *mbmi;
+ const int pack_token = !supertx_enabled;
+ TX_SIZE supertx_size;
+ int plane;
+#endif
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
- m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
-
- partition = partition_lookup[bsl][m->mbmi.sb_type];
- write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
- subsize = get_subsize(bsize, partition);
+ write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
+#if CONFIG_SUPERTX
+ mbmi = &cm->mi_grid_visible[mi_offset]->mbmi;
+ xd->mi = cm->mi_grid_visible + mi_offset;
+ set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[bsize], mi_col,
+ num_8x8_blocks_wide_lookup[bsize], cm->mi_rows, cm->mi_cols);
+ if (!supertx_enabled && !frame_is_intra_only(cm) &&
+ partition != PARTITION_NONE && bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+ !xd->lossless[0]) {
+ aom_prob prob;
+ supertx_size = max_txsize_lookup[bsize];
+ prob = cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
+ [supertx_size];
+ supertx_enabled = (xd->mi[0]->mbmi.tx_size == supertx_size);
+ aom_write(w, supertx_enabled, prob);
+ }
+#endif // CONFIG_SUPERTX
if (subsize < BLOCK_8X8) {
- write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row,
+ mi_col);
} else {
switch (partition) {
case PARTITION_NONE:
- write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col);
break;
case PARTITION_HORZ:
- write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
- if (mi_row + bs < cm->mi_rows)
- write_modes_b(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col);
+ if (mi_row + hbs < cm->mi_rows)
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row + hbs, mi_col);
break;
case PARTITION_VERT:
- write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
- if (mi_col + bs < cm->mi_cols)
- write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col);
+ if (mi_col + hbs < cm->mi_cols)
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col + hbs);
break;
case PARTITION_SPLIT:
- write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
- write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs,
- subsize);
- write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col,
- subsize);
- write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
- subsize);
+ write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col, subsize);
+ write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col + hbs, subsize);
+ write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row + hbs, mi_col, subsize);
+ write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row + hbs, mi_col + hbs, subsize);
break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col + hbs);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row + hbs, mi_col);
+ break;
+ case PARTITION_HORZ_B:
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row + hbs, mi_col);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_VERT_A:
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row + hbs, mi_col);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col + hbs);
+ break;
+ case PARTITION_VERT_B:
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row, mi_col + hbs);
+ write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+ mi_row + hbs, mi_col + hbs);
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
default: assert(0);
}
}
+#if CONFIG_SUPERTX
+ if (partition != PARTITION_NONE && supertx_enabled && pack_token) {
+ int skip;
+ xd->mi = cm->mi_grid_visible + mi_offset;
+ supertx_size = mbmi->tx_size;
+ set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[bsize], mi_col,
+ num_8x8_blocks_wide_lookup[bsize], cm->mi_rows, cm->mi_cols);
- // update partition context
+ assert(IMPLIES(!cm->seg.enabled, mbmi->segment_id_supertx == 0));
+ assert(mbmi->segment_id_supertx < MAX_SEGMENTS);
+
+ skip = write_skip(cm, xd, mbmi->segment_id_supertx, xd->mi[0], w);
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(supertx_size, bsize, 1) > 1 && !skip) {
+ int eset = get_ext_tx_set(supertx_size, bsize, 1);
+ if (eset > 0) {
+ av1_write_token(w, av1_ext_tx_inter_tree[eset],
+ cm->fc->inter_ext_tx_prob[eset][supertx_size],
+ &ext_tx_inter_encodings[eset][mbmi->tx_type]);
+ }
+ }
+#else
+ if (supertx_size < TX_32X32 && !skip) {
+ av1_write_token(w, av1_ext_tx_tree,
+ cm->fc->inter_ext_tx_prob[supertx_size],
+ &ext_tx_encodings[mbmi->tx_type]);
+ }
+#endif // CONFIG_EXT_TX
+
+ if (!skip) {
+ assert(*tok < tok_end);
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const int mbmi_txb_size = txsize_to_bsize[mbmi->tx_size];
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi_txb_size];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi_txb_size];
+ int row, col;
+ TX_SIZE tx =
+ plane ? get_uv_tx_size(mbmi, &xd->plane[plane]) : mbmi->tx_size;
+ BLOCK_SIZE txb_size = txsize_to_bsize[tx];
+ int bw = num_4x4_blocks_wide_lookup[txb_size];
+
+ TOKEN_STATS token_stats;
+ token_stats.cost = 0;
+ for (row = 0; row < num_4x4_h; row += bw)
+ for (col = 0; col < num_4x4_w; col += bw)
+ pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx, &token_stats);
+ assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+ (*tok)++;
+ }
+ }
+ }
+#endif // CONFIG_SUPERTX
+
+// update partition context
+#if CONFIG_EXT_PARTITION_TYPES
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+#else
if (bsize >= BLOCK_8X8 &&
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif // CONFIG_EXT_PARTITION_TYPES
#if CONFIG_DERING
if (bsize == BLOCK_64X64 && cm->dering_level != 0 &&
@@ -1358,12 +2291,17 @@
#endif
}
-static void write_modes(AV1_COMP *cpi, const TileInfo *const tile,
- aom_writer *w, TOKENEXTRA **tok,
+static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
+ aom_writer *const w, const TOKENEXTRA **tok,
const TOKENEXTRA *const tok_end) {
+ AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ const int mi_row_start = tile->mi_row_start;
+ const int mi_row_end = tile->mi_row_end;
+ const int mi_col_start = tile->mi_col_start;
+ const int mi_col_end = tile->mi_col_end;
int mi_row, mi_col;
-
+ av1_zero_above_context(cm, mi_col_start, mi_col_end);
#if CONFIG_PVQ
assert(cpi->td.mb.pvq_q->curr_pos == 0);
#endif
@@ -1373,12 +2311,13 @@
}
#endif
- for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
- mi_row += MAX_MIB_SIZE) {
- av1_zero(xd->left_seg_context);
- for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
- mi_col += MAX_MIB_SIZE)
- write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, BLOCK_64X64);
+ for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += cm->mib_size) {
+ av1_zero_left_context(xd);
+
+ for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += cm->mib_size) {
+ write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, 0, mi_row, mi_col,
+ cm->sb_size);
+ }
}
#if CONFIG_PVQ
// Check that the number of PVQ blocks encoded and written to the bitstream
@@ -1470,11 +2409,10 @@
}
}
- // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
/* Is coef updated at all */
if (update[1] == 0 || savings < 0) {
aom_write_bit(bc, 0);
- break;
+ return;
}
aom_write_bit(bc, 1);
for (i = 0; i < PLANE_TYPES; ++i) {
@@ -1507,7 +2445,7 @@
}
}
}
- break;
+ return;
}
case ONE_LOOP_REDUCED: {
@@ -1560,34 +2498,434 @@
if (updates == 0) {
aom_write_bit(bc, 0); // no updates
}
- break;
+ return;
}
default: assert(0);
}
}
+#if CONFIG_ENTROPY
+// Calculate the token counts between subsequent subframe updates.
+static void get_coef_counts_diff(AV1_COMP *cpi, int index,
+ av1_coeff_count coef_counts[TX_SIZES]
+ [PLANE_TYPES],
+ unsigned int eob_counts[TX_SIZES][PLANE_TYPES]
+ [REF_TYPES][COEF_BANDS]
+ [COEFF_CONTEXTS]) {
+ int i, j, k, l, m, tx_size, val;
+ const int max_idx = cpi->common.coef_probs_update_idx;
+ const TX_MODE tx_mode = cpi->common.tx_mode;
+ const int max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ const SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats;
+
+ assert(max_idx < COEF_PROBS_BUFS);
+
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+ for (i = 0; i < PLANE_TYPES; ++i)
+ for (j = 0; j < REF_TYPES; ++j)
+ for (k = 0; k < COEF_BANDS; ++k)
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ if (index == max_idx) {
+ val =
+ cpi->common.counts.eob_branch[tx_size][i][j][k][l] -
+ subframe_stats->eob_counts_buf[max_idx][tx_size][i][j][k][l];
+ } else {
+ val = subframe_stats->eob_counts_buf[index + 1][tx_size][i][j][k]
+ [l] -
+ subframe_stats->eob_counts_buf[index][tx_size][i][j][k][l];
+ }
+ assert(val >= 0);
+ eob_counts[tx_size][i][j][k][l] = val;
+
+ for (m = 0; m < ENTROPY_TOKENS; ++m) {
+ if (index == max_idx) {
+ val = cpi->td.rd_counts.coef_counts[tx_size][i][j][k][l][m] -
+ subframe_stats->coef_counts_buf[max_idx][tx_size][i][j][k]
+ [l][m];
+ } else {
+ val = subframe_stats->coef_counts_buf[index + 1][tx_size][i][j]
+ [k][l][m] -
+ subframe_stats->coef_counts_buf[index][tx_size][i][j][k]
+ [l][m];
+ }
+ assert(val >= 0);
+ coef_counts[tx_size][i][j][k][l][m] = val;
+ }
+ }
+}
+
+static void update_coef_probs_subframe(
+ aom_writer *const bc, AV1_COMP *cpi, TX_SIZE tx_size,
+ av1_coeff_stats branch_ct[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES],
+ av1_coeff_probs_model *new_coef_probs) {
+ av1_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size];
+ const aom_prob upd = DIFF_UPDATE_PROB;
+ const int entropy_nodes_update = UNCONSTRAINED_NODES;
+ int i, j, k, l, t;
+ int stepsize = cpi->sf.coeff_prob_appx_step;
+ const int max_idx = cpi->common.coef_probs_update_idx;
+ int idx;
+ unsigned int this_branch_ct[ENTROPY_NODES][COEF_PROBS_BUFS][2];
+
+ switch (cpi->sf.use_fast_coef_updates) {
+ case TWO_LOOP: {
+ /* dry run to see if there is any update at all needed */
+ int savings = 0;
+ int update[2] = { 0, 0 };
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ for (t = 0; t < ENTROPY_NODES; ++t) {
+ for (idx = 0; idx <= max_idx; ++idx) {
+ memcpy(this_branch_ct[t][idx],
+ branch_ct[idx][tx_size][i][j][k][l][t],
+ 2 * sizeof(this_branch_ct[t][idx][0]));
+ }
+ }
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ aom_prob newp = new_coef_probs[i][j][k][l][t];
+ const aom_prob oldp = old_coef_probs[i][j][k][l][t];
+ int s, u = 0;
+
+ if (t == PIVOT_NODE)
+ s = av1_prob_update_search_model_subframe(
+ this_branch_ct, old_coef_probs[i][j][k][l], &newp, upd,
+ stepsize, max_idx);
+ else
+ s = av1_prob_update_search_subframe(this_branch_ct[t], oldp,
+ &newp, upd, max_idx);
+ if (s > 0 && newp != oldp) u = 1;
+ if (u)
+ savings += s - (int)(av1_cost_zero(upd));
+ else
+ savings -= (int)(av1_cost_zero(upd));
+ update[u]++;
+ }
+ }
+ }
+ }
+ }
+
+ /* Is coef updated at all */
+ if (update[1] == 0 || savings < 0) {
+ aom_write_bit(bc, 0);
+ return;
+ }
+ aom_write_bit(bc, 1);
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ for (t = 0; t < ENTROPY_NODES; ++t) {
+ for (idx = 0; idx <= max_idx; ++idx) {
+ memcpy(this_branch_ct[t][idx],
+ branch_ct[idx][tx_size][i][j][k][l][t],
+ 2 * sizeof(this_branch_ct[t][idx][0]));
+ }
+ }
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ aom_prob newp = new_coef_probs[i][j][k][l][t];
+ aom_prob *oldp = old_coef_probs[i][j][k][l] + t;
+ int s;
+ int u = 0;
+
+ if (t == PIVOT_NODE)
+ s = av1_prob_update_search_model_subframe(
+ this_branch_ct, old_coef_probs[i][j][k][l], &newp, upd,
+ stepsize, max_idx);
+ else
+ s = av1_prob_update_search_subframe(this_branch_ct[t], *oldp,
+ &newp, upd, max_idx);
+ if (s > 0 && newp != *oldp) u = 1;
+ aom_write(bc, u, upd);
+ if (u) {
+ /* send/use new probability */
+ av1_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+ }
+ }
+ }
+ }
+ }
+ return;
+ }
+
+ case ONE_LOOP_REDUCED: {
+ int updates = 0;
+ int noupdates_before_first = 0;
+ for (i = 0; i < PLANE_TYPES; ++i) {
+ for (j = 0; j < REF_TYPES; ++j) {
+ for (k = 0; k < COEF_BANDS; ++k) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ for (t = 0; t < ENTROPY_NODES; ++t) {
+ for (idx = 0; idx <= max_idx; ++idx) {
+ memcpy(this_branch_ct[t][idx],
+ branch_ct[idx][tx_size][i][j][k][l][t],
+ 2 * sizeof(this_branch_ct[t][idx][0]));
+ }
+ }
+ for (t = 0; t < entropy_nodes_update; ++t) {
+ aom_prob newp = new_coef_probs[i][j][k][l][t];
+ aom_prob *oldp = old_coef_probs[i][j][k][l] + t;
+ int s;
+ int u = 0;
+
+ if (t == PIVOT_NODE)
+ s = av1_prob_update_search_model_subframe(
+ this_branch_ct, old_coef_probs[i][j][k][l], &newp, upd,
+ stepsize, max_idx);
+ else
+ s = av1_prob_update_search_subframe(this_branch_ct[t], *oldp,
+ &newp, upd, max_idx);
+ if (s > 0 && newp != *oldp) u = 1;
+ updates += u;
+ if (u == 0 && updates == 0) {
+ noupdates_before_first++;
+ continue;
+ }
+ if (u == 1 && updates == 1) {
+ int v;
+ // first update
+ aom_write_bit(bc, 1);
+ for (v = 0; v < noupdates_before_first; ++v)
+ aom_write(bc, 0, upd);
+ }
+ aom_write(bc, u, upd);
+ if (u) {
+ /* send/use new probability */
+ av1_write_prob_diff_update(bc, newp, *oldp);
+ *oldp = newp;
+ }
+ }
+ }
+ }
+ }
+ }
+ if (updates == 0) {
+ aom_write_bit(bc, 0); // no updates
+ }
+ return;
+ }
+ default: assert(0);
+ }
+}
+#endif // CONFIG_ENTROPY
+
static void update_coef_probs(AV1_COMP *cpi, aom_writer *w) {
const TX_MODE tx_mode = cpi->common.tx_mode;
const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
TX_SIZE tx_size;
+#if CONFIG_RANS
+ int update = 0;
+#endif // CONFIG_RANS
+#if CONFIG_ENTROPY
+ AV1_COMMON *cm = &cpi->common;
+ SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats;
+ int i;
+ av1_coeff_probs_model dummy_frame_coef_probs[PLANE_TYPES];
+
+ if (cm->do_subframe_update &&
+ cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ av1_copy(cpi->common.fc->coef_probs,
+ subframe_stats->enc_starting_coef_probs);
+ for (i = 0; i <= cpi->common.coef_probs_update_idx; ++i) {
+ get_coef_counts_diff(cpi, i, cpi->wholeframe_stats.coef_counts_buf[i],
+ cpi->wholeframe_stats.eob_counts_buf[i]);
+ }
+ }
+#endif // CONFIG_ENTROPY
+
for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) {
av1_coeff_stats frame_branch_ct[PLANE_TYPES];
av1_coeff_probs_model frame_coef_probs[PLANE_TYPES];
- if (cpi->td.counts->tx.tx_totals[tx_size] <= 20 ||
+ if (cpi->td.counts->tx_size_totals[tx_size] <= 20 ||
(tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8)) {
aom_write_bit(w, 0);
} else {
- build_tree_distribution(cpi, tx_size, frame_branch_ct, frame_coef_probs);
- update_coef_probs_common(w, cpi, tx_size, frame_branch_ct,
- frame_coef_probs);
+#if CONFIG_ENTROPY
+ if (cm->do_subframe_update &&
+ cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ unsigned int this_eob_counts_copy[PLANE_TYPES][REF_TYPES][COEF_BANDS]
+ [COEFF_CONTEXTS];
+ av1_coeff_count coef_counts_copy[PLANE_TYPES];
+ av1_copy(this_eob_counts_copy, cpi->common.counts.eob_branch[tx_size]);
+ av1_copy(coef_counts_copy, cpi->td.rd_counts.coef_counts[tx_size]);
+ build_tree_distribution(cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+ for (i = 0; i <= cpi->common.coef_probs_update_idx; ++i) {
+ av1_copy(cpi->common.counts.eob_branch[tx_size],
+ cpi->wholeframe_stats.eob_counts_buf[i][tx_size]);
+ av1_copy(cpi->td.rd_counts.coef_counts[tx_size],
+ cpi->wholeframe_stats.coef_counts_buf[i][tx_size]);
+ build_tree_distribution(cpi, tx_size, cpi->branch_ct_buf[i][tx_size],
+ dummy_frame_coef_probs);
+ }
+ av1_copy(cpi->common.counts.eob_branch[tx_size], this_eob_counts_copy);
+ av1_copy(cpi->td.rd_counts.coef_counts[tx_size], coef_counts_copy);
+
+ update_coef_probs_subframe(w, cpi, tx_size, cpi->branch_ct_buf,
+ frame_coef_probs);
+#if CONFIG_RANS
+ update = 1;
+#endif // CONFIG_RANS
+ } else {
+#endif // CONFIG_ENTROPY
+ build_tree_distribution(cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+ update_coef_probs_common(w, cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+#if CONFIG_RANS
+ update = 1;
+#endif // CONFIG_RANS
+#if CONFIG_ENTROPY
+ }
+#endif // CONFIG_ENTROPY
}
}
+
+#if CONFIG_ENTROPY
+ av1_copy(cm->starting_coef_probs, cm->fc->coef_probs);
+ av1_copy(subframe_stats->coef_probs_buf[0], cm->fc->coef_probs);
+ if (cm->do_subframe_update &&
+ cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ unsigned int eob_counts_copy[TX_SIZES][PLANE_TYPES][REF_TYPES][COEF_BANDS]
+ [COEFF_CONTEXTS];
+ av1_copy(eob_counts_copy, cm->counts.eob_branch);
+ for (i = 1; i <= cpi->common.coef_probs_update_idx; ++i) {
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+ av1_full_to_model_counts(cm->counts.coef[tx_size],
+ subframe_stats->coef_counts_buf[i][tx_size]);
+ av1_copy(cm->counts.eob_branch, subframe_stats->eob_counts_buf[i]);
+ av1_partial_adapt_probs(cm, 0, 0);
+ av1_copy(subframe_stats->coef_probs_buf[i], cm->fc->coef_probs);
+ }
+ av1_copy(cm->fc->coef_probs, subframe_stats->coef_probs_buf[0]);
+ av1_copy(cm->counts.eob_branch, eob_counts_copy);
+ }
+#endif // CONFIG_ENTROPY
+#if CONFIG_RANS
+ if (update) av1_coef_pareto_cdfs(cpi->common.fc);
+#endif // CONFIG_RANS
}
#endif
-static void encode_loopfilter(struct loopfilter *lf,
- struct aom_write_bit_buffer *wb) {
+#if CONFIG_LOOP_RESTORATION
+static void encode_restoration_mode(AV1_COMMON *cm,
+ struct aom_write_bit_buffer *wb) {
+ RestorationInfo *rst = &cm->rst_info;
+ switch (rst->frame_restoration_type) {
+ case RESTORE_NONE:
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, 0);
+ break;
+ case RESTORE_SWITCHABLE:
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, 1);
+ break;
+ case RESTORE_BILATERAL:
+ aom_wb_write_bit(wb, 1);
+ aom_wb_write_bit(wb, 0);
+ break;
+ case RESTORE_WIENER:
+ aom_wb_write_bit(wb, 1);
+ aom_wb_write_bit(wb, 1);
+ break;
+ default: assert(0);
+ }
+}
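
Editor's note: encode_restoration_mode() above maps the frame restoration type onto two raw bits (00 = NONE, 01 = SWITCHABLE, 10 = BILATERAL, 11 = WIENER). A minimal sketch of the inverse mapping a decoder would apply, using the enum names from the patch but nothing else from the library:

/* Editor's sketch of the inverse of encode_restoration_mode(); the two
 * bits written above are read back in the same order. */
#include <stdio.h>

enum restoration_type {
  RESTORE_NONE,
  RESTORE_SWITCHABLE,
  RESTORE_BILATERAL,
  RESTORE_WIENER
};

static enum restoration_type decode_restoration_mode(const int bits[2]) {
  const int b0 = bits[0];
  const int b1 = bits[1];
  if (!b0) return b1 ? RESTORE_SWITCHABLE : RESTORE_NONE;
  return b1 ? RESTORE_WIENER : RESTORE_BILATERAL;
}

int main(void) {
  /* encode_restoration_mode() writes 1,1 for RESTORE_WIENER. */
  const int wiener_bits[2] = { 1, 1 };
  printf("decoded = %d (RESTORE_WIENER = %d)\n",
         (int)decode_restoration_mode(wiener_bits), (int)RESTORE_WIENER);
  return 0;
}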
+
+static void encode_restoration(AV1_COMMON *cm, aom_writer *wb) {
int i;
+ RestorationInfo *rsi = &cm->rst_info;
+ if (rsi->frame_restoration_type != RESTORE_NONE) {
+ if (rsi->frame_restoration_type == RESTORE_SWITCHABLE) {
+ // RESTORE_SWITCHABLE
+ for (i = 0; i < cm->rst_internal.ntiles; ++i) {
+ av1_write_token(
+ wb, av1_switchable_restore_tree, cm->fc->switchable_restore_prob,
+ &switchable_restore_encodings[rsi->restoration_type[i]]);
+ if (rsi->restoration_type[i] == RESTORE_BILATERAL) {
+ int s;
+ for (s = 0; s < BILATERAL_SUBTILES; ++s) {
+#if BILATERAL_SUBTILES == 0
+ aom_write_literal(wb, rsi->bilateral_info[i].level[s],
+ av1_bilateral_level_bits(cm));
+#else
+ aom_write(wb, rsi->bilateral_info[i].level[s] >= 0,
+ RESTORE_NONE_BILATERAL_PROB);
+ if (rsi->bilateral_info[i].level[s] >= 0) {
+ aom_write_literal(wb, rsi->bilateral_info[i].level[s],
+ av1_bilateral_level_bits(cm));
+ }
+#endif
+ }
+ } else if (rsi->restoration_type[i] == RESTORE_WIENER) {
+ aom_write_literal(
+ wb, rsi->wiener_info[i].vfilter[0] - WIENER_FILT_TAP0_MINV,
+ WIENER_FILT_TAP0_BITS);
+ aom_write_literal(
+ wb, rsi->wiener_info[i].vfilter[1] - WIENER_FILT_TAP1_MINV,
+ WIENER_FILT_TAP1_BITS);
+ aom_write_literal(
+ wb, rsi->wiener_info[i].vfilter[2] - WIENER_FILT_TAP2_MINV,
+ WIENER_FILT_TAP2_BITS);
+ aom_write_literal(
+ wb, rsi->wiener_info[i].hfilter[0] - WIENER_FILT_TAP0_MINV,
+ WIENER_FILT_TAP0_BITS);
+ aom_write_literal(
+ wb, rsi->wiener_info[i].hfilter[1] - WIENER_FILT_TAP1_MINV,
+ WIENER_FILT_TAP1_BITS);
+ aom_write_literal(
+ wb, rsi->wiener_info[i].hfilter[2] - WIENER_FILT_TAP2_MINV,
+ WIENER_FILT_TAP2_BITS);
+ }
+ }
+ } else if (rsi->frame_restoration_type == RESTORE_BILATERAL) {
+ for (i = 0; i < cm->rst_internal.ntiles; ++i) {
+ int s;
+ for (s = 0; s < BILATERAL_SUBTILES; ++s) {
+ aom_write(wb, rsi->bilateral_info[i].level[s] >= 0,
+ RESTORE_NONE_BILATERAL_PROB);
+ if (rsi->bilateral_info[i].level[s] >= 0) {
+ aom_write_literal(wb, rsi->bilateral_info[i].level[s],
+ av1_bilateral_level_bits(cm));
+ }
+ }
+ }
+ } else if (rsi->frame_restoration_type == RESTORE_WIENER) {
+ for (i = 0; i < cm->rst_internal.ntiles; ++i) {
+ aom_write(wb, rsi->wiener_info[i].level != 0, RESTORE_NONE_WIENER_PROB);
+ if (rsi->wiener_info[i].level) {
+ aom_write_literal(
+ wb, rsi->wiener_info[i].vfilter[0] - WIENER_FILT_TAP0_MINV,
+ WIENER_FILT_TAP0_BITS);
+ aom_write_literal(
+ wb, rsi->wiener_info[i].vfilter[1] - WIENER_FILT_TAP1_MINV,
+ WIENER_FILT_TAP1_BITS);
+ aom_write_literal(
+ wb, rsi->wiener_info[i].vfilter[2] - WIENER_FILT_TAP2_MINV,
+ WIENER_FILT_TAP2_BITS);
+ aom_write_literal(
+ wb, rsi->wiener_info[i].hfilter[0] - WIENER_FILT_TAP0_MINV,
+ WIENER_FILT_TAP0_BITS);
+ aom_write_literal(
+ wb, rsi->wiener_info[i].hfilter[1] - WIENER_FILT_TAP1_MINV,
+ WIENER_FILT_TAP1_BITS);
+ aom_write_literal(
+ wb, rsi->wiener_info[i].hfilter[2] - WIENER_FILT_TAP2_MINV,
+ WIENER_FILT_TAP2_BITS);
+ }
+ }
+ }
+ }
+}
+#endif // CONFIG_LOOP_RESTORATION
+
+static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
+ int i;
+ struct loopfilter *lf = &cm->lf;
// Encode the loop filter level and type
aom_wb_write_literal(wb, lf->filter_level, 6);
@@ -1600,7 +2938,7 @@
if (lf->mode_ref_delta_enabled) {
aom_wb_write_bit(wb, lf->mode_ref_delta_update);
if (lf->mode_ref_delta_update) {
- for (i = 0; i < MAX_REF_FRAMES; i++) {
+ for (i = 0; i < TOTAL_REFS_PER_FRAME; i++) {
const int delta = lf->ref_deltas[i];
const int changed = delta != lf->last_ref_deltas[i];
aom_wb_write_bit(wb, changed);
@@ -1667,7 +3005,6 @@
static void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd,
struct aom_write_bit_buffer *wb) {
int i, j;
-
const struct segmentation *seg = &cm->seg;
aom_wb_write_bit(wb, seg->enabled);
@@ -1757,30 +3094,10 @@
#endif
if (cm->tx_mode == TX_MODE_SELECT) {
int i, j;
- unsigned int ct_8x8p[TX_SIZES - 3][2];
- unsigned int ct_16x16p[TX_SIZES - 2][2];
- unsigned int ct_32x32p[TX_SIZES - 1][2];
-
- for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- av1_tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], ct_8x8p);
- for (j = TX_4X4; j < TX_SIZES - 3; j++)
- av1_cond_prob_diff_update(w, &cm->fc->tx_probs.p8x8[i][j], ct_8x8p[j],
- probwt);
- }
-
- for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- av1_tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], ct_16x16p);
- for (j = TX_4X4; j < TX_SIZES - 2; j++)
- av1_cond_prob_diff_update(w, &cm->fc->tx_probs.p16x16[i][j],
- ct_16x16p[j], probwt);
- }
-
- for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- av1_tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], ct_32x32p);
- for (j = TX_4X4; j < TX_SIZES - 1; j++)
- av1_cond_prob_diff_update(w, &cm->fc->tx_probs.p32x32[i][j],
- ct_32x32p[j], probwt);
- }
+ for (i = 0; i < MAX_TX_DEPTH; ++i)
+ for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+ prob_diff_update(av1_tx_size_tree[i], cm->fc->tx_size_probs[i][j],
+ counts->tx_size[i][j], i + 2, probwt, w);
}
}
@@ -1816,6 +3133,33 @@
static void write_tile_info(const AV1_COMMON *const cm,
struct aom_write_bit_buffer *wb) {
+#if CONFIG_EXT_TILE
+ const int tile_width =
+ ALIGN_POWER_OF_TWO(cm->tile_width, cm->mib_size_log2) >>
+ cm->mib_size_log2;
+ const int tile_height =
+ ALIGN_POWER_OF_TWO(cm->tile_height, cm->mib_size_log2) >>
+ cm->mib_size_log2;
+
+ assert(tile_width > 0);
+ assert(tile_height > 0);
+
+// Write the tile sizes
+#if CONFIG_EXT_PARTITION
+ if (cm->sb_size == BLOCK_128X128) {
+ assert(tile_width <= 32);
+ assert(tile_height <= 32);
+ aom_wb_write_literal(wb, tile_width - 1, 5);
+ aom_wb_write_literal(wb, tile_height - 1, 5);
+ } else
+#endif // CONFIG_EXT_PARTITION
+ {
+ assert(tile_width <= 64);
+ assert(tile_height <= 64);
+ aom_wb_write_literal(wb, tile_width - 1, 6);
+ aom_wb_write_literal(wb, tile_height - 1, 6);
+ }
+#else
int min_log2_tile_cols, max_log2_tile_cols, ones;
av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
@@ -1828,13 +3172,14 @@
// rows
aom_wb_write_bit(wb, cm->log2_tile_rows != 0);
if (cm->log2_tile_rows != 0) aom_wb_write_bit(wb, cm->log2_tile_rows != 1);
+#endif // CONFIG_EXT_TILE
}
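
Editor's note: the EXT_TILE branch of write_tile_info() sends tile_width - 1 and tile_height - 1 in 5 bits when 128x128 superblocks are in use and 6 bits otherwise. A quick arithmetic check of the ranges implied by the asserts above (sketch only):

/* Editor's sketch: header cost of the EXT_TILE tile-size fields.  The
 * 5/6-bit widths and the 32/64 limits come from write_tile_info(). */
#include <assert.h>
#include <stdio.h>

static int tile_size_field_bits(int sb_is_128x128) {
  return sb_is_128x128 ? 5 : 6;       /* per dimension */
}

int main(void) {
  /* 5 bits hold tile_width - 1 for widths 1..32, 6 bits for 1..64. */
  assert((1 << tile_size_field_bits(1)) == 32);
  assert((1 << tile_size_field_bits(0)) == 64);
  printf("EXT_TILE size header: %d bits total with 128x128 SBs\n",
         2 * tile_size_field_bits(1));
  return 0;
}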
static int get_refresh_mask(AV1_COMP *cpi) {
int refresh_mask = 0;
#if CONFIG_EXT_REFS
- // NOTE: When LAST_FRAME is to get refreshed, the decoder will be
+ // NOTE(zoeliu): When LAST_FRAME is to get refreshed, the decoder will be
// notified to get LAST3_FRAME refreshed and then the virtual indexes for all
// the 3 LAST reference frames will be updated accordingly, i.e.:
// (1) The original virtual index for LAST3_FRAME will become the new virtual
@@ -1844,7 +3189,12 @@
// LAST3_FRAME.
refresh_mask |=
(cpi->refresh_last_frame << cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]);
- refresh_mask |= (cpi->refresh_bwd_ref_frame << cpi->bwd_fb_idx);
+ if (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs) {
+ // We have swapped the virtual indices
+ refresh_mask |= (cpi->refresh_bwd_ref_frame << cpi->arf_map[0]);
+ } else {
+ refresh_mask |= (cpi->refresh_bwd_ref_frame << cpi->bwd_fb_idx);
+ }
#else
refresh_mask |= (cpi->refresh_last_frame << cpi->lst_fb_idx);
#endif // CONFIG_EXT_REFS
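
Editor's note: get_refresh_mask() composes a bitmask with one bit per reference frame buffer slot to be refreshed. A minimal sketch of how such a mask is built; the slot indices below are made up for illustration, not the encoder's actual fb indexes.

/* Editor's sketch: composing a refresh bitmask the way
 * get_refresh_mask() does.  Slot numbers are illustrative. */
#include <stdio.h>

int main(void) {
  const int lst_fb_idx = 0, gld_fb_idx = 3, alt_fb_idx = 6;
  const int refresh_last = 1, refresh_golden = 0, refresh_alt = 1;

  int refresh_mask = 0;
  refresh_mask |= refresh_last << lst_fb_idx;
  refresh_mask |= refresh_golden << gld_fb_idx;
  refresh_mask |= refresh_alt << alt_fb_idx;

  printf("refresh_mask = 0x%02x\n", refresh_mask); /* prints 0x41 */
  return 0;
}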
@@ -1863,55 +3213,201 @@
return refresh_mask | (cpi->refresh_golden_frame << cpi->alt_fb_idx);
} else {
int arf_idx = cpi->alt_fb_idx;
+#if CONFIG_EXT_REFS
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ arf_idx = cpi->arf_map[gf_group->arf_update_idx[gf_group->index]];
+#else
if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
arf_idx = gf_group->arf_update_idx[gf_group->index];
}
+#endif // CONFIG_EXT_REFS
return refresh_mask | (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
(cpi->refresh_alt_ref_frame << arf_idx);
}
}
+#if CONFIG_EXT_TILE
+static INLINE int find_identical_tile(
+ const int tile_row, const int tile_col,
+ TileBufferEnc (*const tile_buffers)[1024]) {
+ const MV32 candidate_offset[1] = { { 1, 0 } };
+ const uint8_t *const cur_tile_data =
+ tile_buffers[tile_row][tile_col].data + 4;
+ const unsigned int cur_tile_size = tile_buffers[tile_row][tile_col].size;
+
+ int i;
+
+ if (tile_row == 0) return 0;
+
+  // TODO(yunqingwang): For now, only the above tile is checked and used.
+  // More candidates, such as the left tile, can be added later.
+ for (i = 0; i < 1; i++) {
+ int row_offset = candidate_offset[0].row;
+ int col_offset = candidate_offset[0].col;
+ int row = tile_row - row_offset;
+ int col = tile_col - col_offset;
+ uint8_t tile_hdr;
+ const uint8_t *tile_data;
+ TileBufferEnc *candidate;
+
+ if (row < 0 || col < 0) continue;
+
+ tile_hdr = *(tile_buffers[row][col].data);
+
+ // Read out tcm bit
+ if ((tile_hdr >> 7) == 1) {
+ // The candidate is a copy tile itself
+ row_offset += tile_hdr & 0x7f;
+ row = tile_row - row_offset;
+ }
+
+ candidate = &tile_buffers[row][col];
+
+ if (row_offset >= 128 || candidate->size != cur_tile_size) continue;
+
+ tile_data = candidate->data + 4;
+
+ if (memcmp(tile_data, cur_tile_data, cur_tile_size) != 0) continue;
+
+ // Identical tile found
+ assert(row_offset > 0);
+ return row_offset;
+ }
+
+ // No identical tile found
+ return 0;
+}
+#endif // CONFIG_EXT_TILE
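
Editor's note: find_identical_tile() assumes a one-byte tile header in which bit 7 flags a copy tile and bits 0-6 carry the row offset to the source tile (offsets stay below 128, per the check above). A tiny sketch of reading that byte back, with an illustrative header value:

/* Editor's sketch of the copy-tile header byte used by
 * find_identical_tile(): bit 7 = "this tile is a copy",
 * bits 0..6 = row offset to the source tile. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t tile_hdr = 0x83;            /* copy flag set, offset 3 */
  const int is_copy = (tile_hdr >> 7) == 1;
  const int row_offset = tile_hdr & 0x7f;
  printf("copy=%d offset=%d\n", is_copy, row_offset); /* copy=1 offset=3 */
  return 0;
}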
+
#if CONFIG_TILE_GROUPS
-static size_t encode_tiles(AV1_COMP *cpi, struct aom_write_bit_buffer *wb,
- unsigned int *max_tile_sz)
+static uint32_t write_tiles(AV1_COMP *const cpi,
+ struct aom_write_bit_buffer *wb,
+ unsigned int *max_tile_size,
+ unsigned int *max_tile_col_size) {
#else
-static size_t encode_tiles(AV1_COMP *cpi, uint8_t *data_ptr,
- unsigned int *max_tile_sz)
+static uint32_t write_tiles(AV1_COMP *const cpi, uint8_t *const dst,
+ unsigned int *max_tile_size,
+ unsigned int *max_tile_col_size) {
#endif
-{
- AV1_COMMON *const cm = &cpi->common;
+ const AV1_COMMON *const cm = &cpi->common;
#if CONFIG_ANS
- struct AnsCoder ans;
- struct BufAnsCoder *buf_ans = &cpi->buf_ans;
+ struct AnsCoder token_ans;
#else
- aom_writer residual_bc;
+ aom_writer mode_bc;
#endif // CONFIG_ANS
int tile_row, tile_col;
- TOKENEXTRA *tok_end;
- const int tile_cols = 1 << cm->log2_tile_cols;
- const int tile_rows = 1 << cm->log2_tile_rows;
- unsigned int max_tile = 0;
+ TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok;
+ TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers;
+ size_t total_size = 0;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
#if CONFIG_TILE_GROUPS
const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
const int have_tiles = n_log2_tiles > 0;
-
size_t comp_hdr_size;
// Fixed size tile groups for the moment
const int num_tg_hdrs = cm->num_tg;
const int tg_size = (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs;
int tile_count = 0;
int uncompressed_hdr_size = 0;
- uint8_t *data_ptr = NULL;
+ uint8_t *dst = NULL;
struct aom_write_bit_buffer comp_hdr_len_wb;
struct aom_write_bit_buffer tg_params_wb;
int saved_offset;
#endif
- size_t total_size = 0;
+#if CONFIG_EXT_TILE
+ const int have_tiles = tile_cols * tile_rows > 1;
+#endif // CONFIG_EXT_TILE
+#if CONFIG_ANS
+ struct BufAnsCoder *buf_ans = &cpi->buf_ans;
+#endif // CONFIG_ANS
- memset(cm->above_seg_context, 0,
- sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
+ *max_tile_size = 0;
+ *max_tile_col_size = 0;
+// All tile size fields are initially written as 4 bytes. A call to
+// remux_tiles() will later compact the headers if fewer bytes suffice.
+
+#if CONFIG_EXT_TILE
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ TileInfo tile_info;
+ const int is_last_col = (tile_col == tile_cols - 1);
+ const size_t col_offset = total_size;
+
+ av1_tile_set_col(&tile_info, cm, tile_col);
+
+ // The last column does not have a column header
+ if (!is_last_col) total_size += 4;
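+    // (4 bytes are reserved for the column size; it is filled in once the
+    // whole column has been coded.)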
+
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+ unsigned int tile_size;
+ const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
+ const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
+ const int data_offset = have_tiles ? 4 : 0;
+
+ av1_tile_set_row(&tile_info, cm, tile_row);
+
+ buf->data = dst + total_size;
+
+      // If CONFIG_EXT_TILE is set, every tile in the row has a header,
+      // even for the last one, unless no tiling is used at all.
+ total_size += data_offset;
+#if !CONFIG_ANS
+ aom_start_encode(&mode_bc, buf->data + data_offset);
+ write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
+ assert(tok == tok_end);
+ aom_stop_encode(&mode_bc);
+ tile_size = mode_bc.pos;
+#else
+ buf_ans_write_reset(buf_ans);
+ write_modes(cpi, &tile_info, buf_ans, &tok, tok_end);
+ assert(tok == tok_end);
+ ans_write_init(&token_ans, buf->data + data_offset);
+ buf_ans_flush(buf_ans, &token_ans);
+ tile_size = ans_write_end(&token_ans);
+#endif // !CONFIG_ANS
+
+ buf->size = tile_size;
+
+ // Record the maximum tile size we see, so we can compact headers later.
+ *max_tile_size = AOMMAX(*max_tile_size, tile_size);
+
+ if (have_tiles) {
+ // tile header: size of this tile, or copy offset
+ uint32_t tile_header = tile_size;
+
+ // Check if this tile is a copy tile.
+          // Copy tiles are very unlikely on key frames, so skip the search
+          // there to avoid unnecessary work.
+ if (cm->frame_type != KEY_FRAME) {
+          const int identical_tile_offset =
+              find_identical_tile(tile_row, tile_col, tile_buffers);
+
+          if (identical_tile_offset > 0) {
+            tile_size = 0;
+            tile_header = identical_tile_offset | 0x80;
+ tile_header <<= 24;
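+            // The copy flag and offset occupy the top byte of the 32-bit
+            // header; remux_tiles() later shifts them into the compacted
+            // size field.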
+ }
+ }
+
+ mem_put_le32(buf->data, tile_header);
+ }
+
+ total_size += tile_size;
+ }
+
+ if (!is_last_col) {
+ size_t col_size = total_size - col_offset - 4;
+ mem_put_le32(dst + col_offset, col_size);
+
+      // Record the maximum tile column size we see; remux_tiles() uses it to
+      // choose how many bytes the column size fields need.
+ *max_tile_col_size = AOMMAX(*max_tile_col_size, col_size);
+ }
+ }
+#else
#if CONFIG_TILE_GROUPS
write_uncompressed_header(cpi, wb);
@@ -1928,30 +3424,37 @@
aom_wb_write_literal(wb, 0, 16);
uncompressed_hdr_size = aom_wb_bytes_written(wb);
- data_ptr = wb->bit_buffer;
- comp_hdr_size =
- write_compressed_header(cpi, data_ptr + uncompressed_hdr_size);
+ dst = wb->bit_buffer;
+ comp_hdr_size = write_compressed_header(cpi, dst + uncompressed_hdr_size);
aom_wb_write_literal(&comp_hdr_len_wb, (int)(comp_hdr_size), 16);
total_size += uncompressed_hdr_size + comp_hdr_size;
#endif
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ TileInfo tile_info;
+ const int is_last_row = (tile_row == tile_rows - 1);
+
+ av1_tile_set_row(&tile_info, cm, tile_row);
+
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
const int tile_idx = tile_row * tile_cols + tile_col;
+ TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+ const int is_last_col = (tile_col == tile_cols - 1);
unsigned int tile_size;
#if CONFIG_PVQ
TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
#endif
- TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
+ const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
+ const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
#if !CONFIG_TILE_GROUPS
- const int is_last_tile = tile_idx == tile_rows * tile_cols - 1;
+ const int is_last_tile = is_last_col && is_last_row;
+ (void)tile_idx;
#else
// All tiles in a tile group have a length
const int is_last_tile = 0;
if (tile_count >= tg_size) {
// Copy uncompressed header
- memcpy(data_ptr + total_size, data_ptr,
- uncompressed_hdr_size * sizeof(uint8_t));
+ memcpy(dst + total_size, dst, uncompressed_hdr_size * sizeof(uint8_t));
// Write the number of tiles in the group into the last uncompressed
// header
aom_wb_write_literal(&tg_params_wb, tile_idx - tile_count,
@@ -1959,52 +3462,55 @@
aom_wb_write_literal(&tg_params_wb, tile_count - 1, n_log2_tiles);
tg_params_wb.bit_offset = saved_offset + 8 * total_size;
// Copy compressed header
- memcpy(data_ptr + total_size + uncompressed_hdr_size,
- data_ptr + uncompressed_hdr_size,
- comp_hdr_size * sizeof(uint8_t));
+ memcpy(dst + total_size + uncompressed_hdr_size,
+ dst + uncompressed_hdr_size, comp_hdr_size * sizeof(uint8_t));
total_size += uncompressed_hdr_size;
total_size += comp_hdr_size;
tile_count = 0;
}
tile_count++;
#endif
+ av1_tile_set_col(&tile_info, cm, tile_col);
- tok_end = cpi->tile_tok[tile_row][tile_col] +
- cpi->tok_count[tile_row][tile_col];
+ buf->data = dst + total_size;
+
+ // The last tile does not have a header.
+ if (!is_last_tile) total_size += 4;
#if CONFIG_ANS
buf_ans_write_reset(buf_ans);
- write_modes(cpi, &cpi->tile_data[tile_idx].tile_info, buf_ans, &tok,
- tok_end);
+ write_modes(cpi, &tile_info, buf_ans, &tok, tok_end);
assert(tok == tok_end);
- ans_write_init(&ans, data_ptr + total_size + 4 * !is_last_tile);
- buf_ans_flush(buf_ans, &ans);
- tile_size = ans_write_end(&ans) - 1;
+ ans_write_init(&token_ans, dst + total_size);
+ buf_ans_flush(buf_ans, &token_ans);
+ tile_size = ans_write_end(&token_ans);
#else
- aom_start_encode(&residual_bc, data_ptr + total_size + 4 * !is_last_tile);
-
+ aom_start_encode(&mode_bc, dst + total_size);
#if CONFIG_PVQ
// NOTE: This will not work with CONFIG_ANS turned on.
od_adapt_ctx_reset(&cpi->td.mb.daala_enc.state.adapt, 0);
cpi->td.mb.pvq_q = &this_tile->pvq_q;
#endif
- write_modes(cpi, &cpi->tile_data[tile_idx].tile_info, &residual_bc, &tok,
- tok_end);
+ write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
assert(tok == tok_end);
- aom_stop_encode(&residual_bc);
- tile_size = residual_bc.pos - 1;
+ aom_stop_encode(&mode_bc);
+ tile_size = mode_bc.pos;
#endif
#if CONFIG_PVQ
cpi->td.mb.pvq_q = NULL;
-#endif
+#endif  // CONFIG_PVQ
+
assert(tile_size > 0);
+
+ buf->size = tile_size;
+
if (!is_last_tile) {
+ *max_tile_size = AOMMAX(*max_tile_size, tile_size);
// size of this tile
- mem_put_le32(data_ptr + total_size, tile_size);
- max_tile = max_tile > tile_size ? max_tile : tile_size;
- total_size += 4;
+ mem_put_le32(buf->data, tile_size);
}
- total_size += tile_size + 1;
+
+ total_size += tile_size;
}
}
#if CONFIG_TILE_GROUPS
@@ -2015,9 +3521,8 @@
aom_wb_write_literal(&tg_params_wb, tile_count - 1, n_log2_tiles);
}
#endif
- *max_tile_sz = max_tile;
-
- return total_size;
+#endif // CONFIG_EXT_TILE
+ return (uint32_t)total_size;
}
static void write_render_size(const AV1_COMMON *cm,
@@ -2122,7 +3627,6 @@
cm->is_reference_frame = 1;
if (cm->show_existing_frame) {
- MV_REFERENCE_FRAME ref_frame;
RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show];
@@ -2136,16 +3640,6 @@
aom_wb_write_bit(wb, 1); // show_existing_frame
aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
- cpi->refresh_frame_mask = get_refresh_mask(cpi);
- aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
-
- for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
- aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
- REF_FRAMES_LOG2);
- aom_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
- }
-
return;
} else {
#endif // CONFIG_EXT_REFS
@@ -2239,22 +3733,31 @@
}
if (!cm->error_resilient_mode) {
- aom_wb_write_bit(wb,
- cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF);
- if (cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF)
- aom_wb_write_bit(
- wb, cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_BACKWARD);
+ aom_wb_write_bit(
+ wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD);
}
aom_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
- encode_loopfilter(&cm->lf, wb);
+ assert(cm->mib_size == num_8x8_blocks_wide_lookup[cm->sb_size]);
+ assert(cm->mib_size == 1 << cm->mib_size_log2);
+#if CONFIG_EXT_PARTITION
+ assert(cm->sb_size == BLOCK_128X128 || cm->sb_size == BLOCK_64X64);
+ aom_wb_write_bit(wb, cm->sb_size == BLOCK_128X128 ? 1 : 0);
+#else
+ assert(cm->sb_size == BLOCK_64X64);
+#endif // CONFIG_EXT_PARTITION
+
+ encode_loopfilter(cm, wb);
#if CONFIG_DERING
encode_dering(cm->dering_level, wb);
#endif // CONFIG_DERING
#if CONFIG_CLPF
encode_clpf(cm, wb);
#endif
+#if CONFIG_LOOP_RESTORATION
+ encode_restoration_mode(cm, wb);
+#endif // CONFIG_LOOP_RESTORATION
encode_quantization(cm, wb);
encode_segmentation(cm, xd, wb);
#if CONFIG_DELTA_Q
@@ -2279,9 +3782,10 @@
#endif
if (!cm->seg.enabled && xd->lossless[0])
- cm->tx_mode = TX_4X4;
+ cm->tx_mode = ONLY_4X4;
else
write_txfm_mode(cm->tx_mode, wb);
+
if (cpi->allow_comp_inter_inter) {
const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
@@ -2293,8 +3797,70 @@
write_tile_info(cm, wb);
}
-static size_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) {
+#if CONFIG_GLOBAL_MOTION
+static void write_global_motion_params(Global_Motion_Params *params,
+ aom_prob *probs, aom_writer *w) {
+ GLOBAL_MOTION_TYPE gmtype = get_gmtype(params);
+ av1_write_token(w, av1_global_motion_types_tree, probs,
+ &global_motion_types_encodings[gmtype]);
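+  // Cases below intentionally fall through: a higher-order model first writes
+  // its extra parameters, then the parameters it shares with simpler models.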
+ switch (gmtype) {
+ case GLOBAL_ZERO: break;
+ case GLOBAL_AFFINE:
+ aom_write_primitive_symmetric(
+ w, (params->motion_params.wmmat[4] >> GM_ALPHA_PREC_DIFF),
+ GM_ABS_ALPHA_BITS);
+ aom_write_primitive_symmetric(
+ w, (params->motion_params.wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ GM_ABS_ALPHA_BITS);
+ // fallthrough intended
+ case GLOBAL_ROTZOOM:
+ aom_write_primitive_symmetric(
+ w, (params->motion_params.wmmat[2] >> GM_ALPHA_PREC_DIFF),
+ GM_ABS_ALPHA_BITS);
+ aom_write_primitive_symmetric(
+ w, (params->motion_params.wmmat[3] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ GM_ABS_ALPHA_BITS);
+ // fallthrough intended
+ case GLOBAL_TRANSLATION:
+ aom_write_primitive_symmetric(
+ w, (params->motion_params.wmmat[0] >> GM_TRANS_PREC_DIFF),
+ GM_ABS_TRANS_BITS);
+ aom_write_primitive_symmetric(
+ w, (params->motion_params.wmmat[1] >> GM_TRANS_PREC_DIFF),
+ GM_ABS_TRANS_BITS);
+ break;
+ default: assert(0);
+ }
+}
+
+static void write_global_motion(AV1_COMP *cpi, aom_writer *w) {
AV1_COMMON *const cm = &cpi->common;
+ int frame;
+ for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+ if (!cpi->global_motion_used[frame]) {
+ memset(&cm->global_motion[frame], 0, sizeof(*cm->global_motion));
+ }
+ write_global_motion_params(&cm->global_motion[frame],
+ cm->fc->global_motion_types_prob, w);
+ /*
+ printf("Enc Ref %d [%d] (used %d): %d %d %d %d\n",
+ frame, cm->current_video_frame, cpi->global_motion_used[frame],
+ cm->global_motion[frame].motion_params.wmmat[0].as_mv.row,
+ cm->global_motion[frame].motion_params.wmmat[0].as_mv.col,
+ cm->global_motion[frame].motion_params.wmmat[1].as_mv.row,
+ cm->global_motion[frame].motion_params.wmmat[1].as_mv.col);
+ */
+ }
+}
+#endif
+
+static uint32_t write_compressed_header(AV1_COMP *cpi, uint8_t *data) {
+ AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_SUPERTX
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+#endif // CONFIG_SUPERTX
FRAME_CONTEXT *const fc = cm->fc;
FRAME_COUNTS *counts = cpi->td.counts;
aom_writer *header_bc;
@@ -2317,11 +3883,23 @@
aom_start_encode(header_bc, data);
#endif
+#if CONFIG_LOOP_RESTORATION
+ encode_restoration(cm, header_bc);
+#endif // CONFIG_LOOP_RESTORATION
update_txfm_probs(cm, header_bc, counts);
-
#if !CONFIG_PVQ
update_coef_probs(cpi, header_bc);
#endif
+#if CONFIG_VAR_TX
+ update_txfm_partition_probs(cm, header_bc, counts, probwt);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (cm->tx_mode == TX_MODE_SELECT) {
+ for (i = 1; i < TX_SIZES - 1; ++i)
+ av1_cond_prob_diff_update(header_bc, &fc->rect_tx_prob[i],
+ counts->rect_tx[i], probwt);
+ }
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+#endif
update_skip_probs(cm, header_bc, counts);
#if CONFIG_DELTA_Q
@@ -2335,12 +3913,26 @@
counts->uv_mode[i], INTRA_MODES, probwt, header_bc);
}
+#if CONFIG_EXT_PARTITION_TYPES
+ prob_diff_update(av1_partition_tree, fc->partition_prob[0],
+ counts->partition[0], PARTITION_TYPES, probwt, header_bc);
+ for (i = 1; i < PARTITION_CONTEXTS; ++i)
+ prob_diff_update(av1_ext_partition_tree, fc->partition_prob[i],
+ counts->partition[i], EXT_PARTITION_TYPES, probwt,
+ header_bc);
+#else
for (i = 0; i < PARTITION_CONTEXTS; ++i) {
prob_diff_update(av1_partition_tree, fc->partition_prob[i],
counts->partition[i], PARTITION_TYPES, probwt, header_bc);
}
-#endif
+#endif  // CONFIG_EXT_PARTITION_TYPES
+#if CONFIG_EXT_INTRA
+ for (i = 0; i < INTRA_FILTERS + 1; ++i)
+ prob_diff_update(av1_intra_filter_tree, fc->intra_filter_probs[i],
+ counts->intra_filter[i], INTRA_FILTERS, probwt, header_bc);
+#endif // CONFIG_EXT_INTRA
+#endif // CONFIG_EC_ADAPT, CONFIG_DAALA_EC
if (frame_is_intra_only(cm)) {
av1_copy(cm->kf_y_prob, av1_kf_y_mode_prob);
#if CONFIG_DAALA_EC
@@ -2365,13 +3957,40 @@
}
#endif
#endif
-#if CONFIG_MOTION_VAR
- for (i = 0; i < BLOCK_SIZES; ++i)
- if (is_motion_variation_allowed_bsize(i))
- prob_diff_update(av1_motion_mode_tree, cm->fc->motion_mode_prob[i],
- counts->motion_mode[i], MOTION_MODES, probwt,
- header_bc);
-#endif // CONFIG_MOTION_VAR
+#if CONFIG_EXT_INTER
+ update_inter_compound_mode_probs(cm, probwt, header_bc);
+
+ if (cm->reference_mode != COMPOUND_REFERENCE) {
+ for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+ if (is_interintra_allowed_bsize_group(i)) {
+ av1_cond_prob_diff_update(header_bc, &fc->interintra_prob[i],
+ cm->counts.interintra[i], probwt);
+ }
+ }
+ for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+ prob_diff_update(
+ av1_interintra_mode_tree, cm->fc->interintra_mode_prob[i],
+ counts->interintra_mode[i], INTERINTRA_MODES, probwt, header_bc);
+ }
+ for (i = 0; i < BLOCK_SIZES; i++) {
+ if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i))
+ av1_cond_prob_diff_update(header_bc, &fc->wedge_interintra_prob[i],
+ cm->counts.wedge_interintra[i], probwt);
+ }
+ }
+ if (cm->reference_mode != SINGLE_REFERENCE) {
+ for (i = 0; i < BLOCK_SIZES; i++)
+ if (is_interinter_wedge_used(i))
+ av1_cond_prob_diff_update(header_bc, &fc->wedge_interinter_prob[i],
+ cm->counts.wedge_interinter[i], probwt);
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ for (i = BLOCK_8X8; i < BLOCK_SIZES; ++i)
+ prob_diff_update(av1_motion_mode_tree, fc->motion_mode_prob[i],
+ counts->motion_mode[i], MOTION_MODES, probwt, header_bc);
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
#if !CONFIG_EC_ADAPT
if (cm->interp_filter == SWITCHABLE)
update_switchable_interp_probs(cm, header_bc, counts);
@@ -2389,26 +4008,33 @@
counts->comp_inter[i], probwt);
}
- if (cm->reference_mode != COMPOUND_REFERENCE)
- for (i = 0; i < REF_CONTEXTS; i++)
- for (j = 0; j < (SINGLE_REFS - 1); j++)
+ if (cm->reference_mode != COMPOUND_REFERENCE) {
+ for (i = 0; i < REF_CONTEXTS; i++) {
+ for (j = 0; j < (SINGLE_REFS - 1); j++) {
av1_cond_prob_diff_update(header_bc, &fc->single_ref_prob[i][j],
counts->single_ref[i][j], probwt);
- if (cm->reference_mode != SINGLE_REFERENCE)
-#if CONFIG_EXT_REFS
+ }
+ }
+ }
+ if (cm->reference_mode != SINGLE_REFERENCE) {
for (i = 0; i < REF_CONTEXTS; i++) {
- for (j = 0; j < (FWD_REFS - 1); j++)
- av1_cond_prob_diff_update(header_bc, &fc->comp_fwdref_prob[i][j],
- counts->comp_fwdref[i][j], probwt);
- for (j = 0; j < (BWD_REFS - 1); j++)
+#if CONFIG_EXT_REFS
+ for (j = 0; j < (FWD_REFS - 1); j++) {
+ av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j],
+ counts->comp_ref[i][j], probwt);
+ }
+ for (j = 0; j < (BWD_REFS - 1); j++) {
av1_cond_prob_diff_update(header_bc, &fc->comp_bwdref_prob[i][j],
counts->comp_bwdref[i][j], probwt);
- }
+ }
#else
- for (i = 0; i < REF_CONTEXTS; i++)
- av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i],
- counts->comp_ref[i], probwt);
+ for (j = 0; j < (COMP_REFS - 1); j++) {
+ av1_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j],
+ counts->comp_ref[i][j], probwt);
+ }
#endif // CONFIG_EXT_REFS
+ }
+ }
#if !CONFIG_EC_ADAPT
for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
@@ -2423,9 +4049,19 @@
#else
&counts->mv);
#endif
+#if CONFIG_DAALA_EC
+ av1_tree_to_cdf(av1_mv_joint_tree, cm->fc->nmvc.joints,
+ cm->fc->nmvc.joint_cdf);
+#endif
#if !CONFIG_EC_ADAPT
update_ext_tx_probs(cm, header_bc);
#endif
+#if CONFIG_SUPERTX
+ if (!xd->lossless[0]) update_supertx_probs(cm, probwt, header_bc);
+#endif // CONFIG_SUPERTX
+#if CONFIG_GLOBAL_MOTION
+ write_global_motion(cpi, header_bc);
+#endif // CONFIG_GLOBAL_MOTION
}
#if CONFIG_EC_MULTISYMBOL
av1_coef_pareto_cdfs(fc);
@@ -2434,7 +4070,6 @@
av1_set_mode_cdfs(cm);
#endif
#endif
-
#if CONFIG_ANS
ans_write_init(&header_ans, data);
buf_ans_flush(header_bc, &header_ans);
@@ -2448,59 +4083,159 @@
#endif // CONFIG_ANS
}
-static int remux_tiles(uint8_t *dest, const int sz, const int n_tiles,
- const int mag) {
- int rpos = 0, wpos = 0, n;
+#if !CONFIG_TILE_GROUPS
+static int choose_size_bytes(uint32_t size, int spare_msbs) {
+ // Choose the number of bytes required to represent size, without
+ // using the 'spare_msbs' number of most significant bits.
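+  // e.g. choose_size_bytes(0x1234, 0) returns 2; a size that needs any of the
+  // 'spare_msbs' top bits cannot be represented and -1 is returned.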
- for (n = 0; n < n_tiles; n++) {
- int tile_sz;
+  // Make sure we will fit in 4 bytes to start with.
+ if (spare_msbs > 0 && size >> (32 - spare_msbs) != 0) return -1;
- if (n == n_tiles - 1) {
- tile_sz = sz - rpos;
- } else {
- tile_sz = mem_get_le32(&dest[rpos]) + 1;
- rpos += 4;
- switch (mag) {
- case 0: dest[wpos] = tile_sz - 1; break;
- case 1: mem_put_le16(&dest[wpos], tile_sz - 1); break;
- case 2: mem_put_le24(&dest[wpos], tile_sz - 1); break;
- case 3: // remuxing should only happen if mag < 3
- default: assert("Invalid value for tile size magnitude" && 0);
- }
- wpos += mag + 1;
- }
+ // Normalise to 32 bits
+ size <<= spare_msbs;
- memmove(&dest[wpos], &dest[rpos], tile_sz);
- wpos += tile_sz;
- rpos += tile_sz;
- }
-
- assert(rpos > wpos);
- assert(rpos == sz);
-
- return wpos;
+ if (size >> 24 != 0)
+ return 4;
+ else if (size >> 16 != 0)
+ return 3;
+ else if (size >> 8 != 0)
+ return 2;
+ else
+ return 1;
}
-void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size) {
- uint8_t *data = dest;
+static void mem_put_varsize(uint8_t *const dst, const int sz, const int val) {
+ switch (sz) {
+ case 1: dst[0] = (uint8_t)(val & 0xff); break;
+ case 2: mem_put_le16(dst, val); break;
+ case 3: mem_put_le24(dst, val); break;
+ case 4: mem_put_le32(dst, val); break;
+ default: assert("Invalid size" && 0); break;
+ }
+}
+static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst,
+ const uint32_t data_size, const uint32_t max_tile_size,
+ const uint32_t max_tile_col_size,
+ int *const tile_size_bytes,
+ int *const tile_col_size_bytes) {
+// Choose the tile size bytes (tsb) and tile column size bytes (tcsb)
+#if CONFIG_EXT_TILE
+  // The top bit in the tile size field indicates tile copy mode, so we
+  // have one fewer bit available to code the tile size.
+ const int tsb = choose_size_bytes(max_tile_size, 1);
+ const int tcsb = choose_size_bytes(max_tile_col_size, 0);
+#else
+ const int tsb = choose_size_bytes(max_tile_size, 0);
+ const int tcsb = 4; // This is ignored
+ (void)max_tile_col_size;
+#endif // CONFIG_EXT_TILE
+
+ assert(tsb > 0);
+ assert(tcsb > 0);
+
+ *tile_size_bytes = tsb;
+ *tile_col_size_bytes = tcsb;
+
+ if (tsb == 4 && tcsb == 4) {
+ return data_size;
+ } else {
+ uint32_t wpos = 0;
+ uint32_t rpos = 0;
+
+#if CONFIG_EXT_TILE
+ int tile_row;
+ int tile_col;
+
+ for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
+      // Every column but the last has a column header.
+ if (tile_col < cm->tile_cols - 1) {
+ uint32_t tile_col_size = mem_get_le32(dst + rpos);
+ rpos += 4;
+
+ // Adjust the tile column size by the number of bytes removed
+ // from the tile size fields.
+ tile_col_size -= (4 - tsb) * cm->tile_rows;
+
+ mem_put_varsize(dst + wpos, tcsb, tile_col_size);
+ wpos += tcsb;
+ }
+
+ for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+        // All rows, including the last, have a header.
+ uint32_t tile_header = mem_get_le32(dst + rpos);
+ rpos += 4;
+
+        // If this is a copy tile, we need to shift the copy-mode flag into
+        // the top bit of the new, narrower size field; there is no tile data
+        // to copy.
+ if (tile_header >> 31 != 0) {
+ if (tsb < 4) tile_header >>= 32 - 8 * tsb;
+ mem_put_varsize(dst + wpos, tsb, tile_header);
+ wpos += tsb;
+ } else {
+ mem_put_varsize(dst + wpos, tsb, tile_header);
+ wpos += tsb;
+
+ memmove(dst + wpos, dst + rpos, tile_header);
+ rpos += tile_header;
+ wpos += tile_header;
+ }
+ }
+ }
+#else
+ const int n_tiles = cm->tile_cols * cm->tile_rows;
+ int n;
+
+ for (n = 0; n < n_tiles; n++) {
+ int tile_size;
+
+ if (n == n_tiles - 1) {
+ tile_size = data_size - rpos;
+ } else {
+ tile_size = mem_get_le32(dst + rpos);
+ rpos += 4;
+ mem_put_varsize(dst + wpos, tsb, tile_size);
+ wpos += tsb;
+ }
+
+ memmove(dst + wpos, dst + rpos, tile_size);
+
+ rpos += tile_size;
+ wpos += tile_size;
+ }
+#endif // CONFIG_EXT_TILE
+
+ assert(rpos > wpos);
+ assert(rpos == data_size);
+
+ return wpos;
+ }
+}
+#endif // CONFIG_TILE_GROUPS
+
+void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
+ uint8_t *data = dst;
#if !CONFIG_TILE_GROUPS
- size_t first_part_size = 0;
+ uint32_t compressed_header_size;
+ uint32_t uncompressed_header_size;
struct aom_write_bit_buffer saved_wb;
#endif
- size_t data_sz;
+ uint32_t data_size;
struct aom_write_bit_buffer wb = { data, 0 };
- unsigned int max_tile;
- AV1_COMMON *const cm = &cpi->common;
+
+ unsigned int max_tile_size;
+ unsigned int max_tile_col_size;
#if !CONFIG_TILE_GROUPS
- size_t uncompressed_hdr_size;
- const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
- const int have_tiles = n_log2_tiles > 0;
+ int tile_size_bytes;
+ int tile_col_size_bytes;
+ AV1_COMMON *const cm = &cpi->common;
+ const int have_tiles = cm->tile_cols * cm->tile_rows > 1;
#if CONFIG_BITSTREAM_DEBUG
bitstream_queue_reset_write();
#endif
+ // Write the uncompressed header
write_uncompressed_header(cpi, &wb);
#if CONFIG_EXT_REFS
@@ -2510,45 +4245,59 @@
}
#endif // CONFIG_EXT_REFS
+  // We do not know these sizes in advance; output placeholder bits for now.
saved_wb = wb;
- // don't know in advance first part. size
- aom_wb_write_literal(&wb, 0, 16 + have_tiles * 2);
+ // Write tile size magnitudes
+ if (have_tiles) {
+// Note that the last item in the uncompressed header is the data
+// describing tile configuration.
+#if CONFIG_EXT_TILE
+ // Number of bytes in tile column size - 1
+ aom_wb_write_literal(&wb, 0, 2);
+#endif // CONFIG_EXT_TILE
+ // Number of bytes in tile size - 1
+ aom_wb_write_literal(&wb, 0, 2);
+ }
+ // Size of compressed header
+ aom_wb_write_literal(&wb, 0, 16);
- uncompressed_hdr_size = aom_wb_bytes_written(&wb);
- data += uncompressed_hdr_size;
+ uncompressed_header_size = (uint32_t)aom_wb_bytes_written(&wb);
+ data += uncompressed_header_size;
aom_clear_system_state();
- first_part_size = write_compressed_header(cpi, data);
- data += first_part_size;
- data_sz = encode_tiles(cpi, data, &max_tile);
+
+ // Write the compressed header
+ compressed_header_size = write_compressed_header(cpi, data);
+ data += compressed_header_size;
+
+ // Write the encoded tile data
+ data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size);
#else
- data_sz = encode_tiles(cpi, &wb, &max_tile);
+ data_size = write_tiles(cpi, &wb, &max_tile_size, &max_tile_col_size);
#endif
#if !CONFIG_TILE_GROUPS
- /* A global size of tile lengths in a frame does not fit with tile
- groups, as we may want to transmit a tile group as soon as encoded,
- rather than buffering the frame.
- */
- if (max_tile > 0) {
- int mag;
- unsigned int mask;
-
- // Choose the (tile size) magnitude
- for (mag = 0, mask = 0xff; mag < 4; mag++) {
- if (max_tile <= mask) break;
- mask <<= 8;
- mask |= 0xff;
- }
- assert(n_log2_tiles > 0);
- aom_wb_write_literal(&saved_wb, mag, 2);
- if (mag < 3)
- data_sz = remux_tiles(data, (int)data_sz, 1 << n_log2_tiles, mag);
- } else {
- assert(n_log2_tiles == 0);
+ if (have_tiles) {
+ data_size =
+ remux_tiles(cm, data, data_size, max_tile_size, max_tile_col_size,
+ &tile_size_bytes, &tile_col_size_bytes);
}
- // TODO(jbb): Figure out what to do if first_part_size > 16 bits.
- aom_wb_write_literal(&saved_wb, (int)first_part_size, 16);
+
+ data += data_size;
+
+ // Now fill in the gaps in the uncompressed header.
+ if (have_tiles) {
+#if CONFIG_EXT_TILE
+ assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4);
+ aom_wb_write_literal(&saved_wb, tile_col_size_bytes - 1, 2);
+#endif // CONFIG_EXT_TILE
+ assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+ aom_wb_write_literal(&saved_wb, tile_size_bytes - 1, 2);
+ }
+ // TODO(jbb): Figure out what to do if compressed_header_size > 16 bits.
+ assert(compressed_header_size <= 0xffff);
+ aom_wb_write_literal(&saved_wb, compressed_header_size, 16);
+#else
+ data += data_size;
#endif
- data += data_sz;
- *size = data - dest;
+ *size = data - dst;
}
diff --git a/av1/encoder/bitstream.h b/av1/encoder/bitstream.h
index a9bb97a..89a014c 100644
--- a/av1/encoder/bitstream.h
+++ b/av1/encoder/bitstream.h
@@ -18,12 +18,19 @@
#include "av1/encoder/encoder.h"
-void av1_encode_token_init();
void av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size);
+void av1_encode_token_init(void);
+
static INLINE int av1_preserve_existing_gf(AV1_COMP *cpi) {
+#if CONFIG_EXT_REFS
+ // Do not swap gf and arf indices for internal overlay frames
+ return !cpi->multi_arf_allowed && cpi->rc.is_src_frame_alt_ref &&
+ !cpi->rc.is_src_frame_ext_arf;
+#else
return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
cpi->rc.is_src_frame_alt_ref;
+#endif // CONFIG_EXT_REFS
}
#ifdef __cplusplus
diff --git a/av1/encoder/block.h b/av1/encoder/block.h
index 211ae58..aa04389 100644
--- a/av1/encoder/block.h
+++ b/av1/encoder/block.h
@@ -37,8 +37,8 @@
unsigned int var;
} DIFF;
-struct macroblock_plane {
- DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
+typedef struct macroblock_plane {
+ DECLARE_ALIGNED(16, int16_t, src_diff[MAX_SB_SQUARE]);
#if CONFIG_PVQ
DECLARE_ALIGNED(16, int16_t, src_int16[MAX_SB_SQUARE]);
#endif
@@ -54,9 +54,12 @@
const int16_t *quant_shift;
const int16_t *zbin;
const int16_t *round;
+#if CONFIG_NEW_QUANT
+ const cuml_bins_type_nuq *cuml_bins_nuq[QUANT_PROFILES];
+#endif // CONFIG_NEW_QUANT
int64_t quant_thred[2];
-};
+} MACROBLOCK_PLANE;
/* The [2] dimension is for whether we skip the EOB node (i.e. if previous
* coefficient in this block was zero) or not. */
@@ -69,13 +72,16 @@
#if CONFIG_REF_MV
uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+#if CONFIG_EXT_INTER
+ int16_t compound_mode_context[MODE_CTX_REF_FRAMES];
+#endif // CONFIG_EXT_INTER
#endif
} MB_MODE_INFO_EXT;
#if CONFIG_PALETTE
typedef struct {
- uint8_t best_palette_color_map[4096];
- float kmeans_data_buf[2 * 4096];
+ uint8_t best_palette_color_map[MAX_SB_SQUARE];
+ float kmeans_data_buf[2 * MAX_SB_SQUARE];
} PALETTE_BUFFER;
#endif // CONFIG_PALETTE
@@ -86,7 +92,7 @@
MACROBLOCKD e_mbd;
MB_MODE_INFO_EXT *mbmi_ext;
int skip_block;
- int q_index;
+ int qindex;
// The equivalent error at the current rdmult of one whole bit (not one
// bitcost unit).
@@ -103,16 +109,21 @@
int *m_search_count_ptr;
int *ex_search_count_ptr;
+#if CONFIG_VAR_TX
+ unsigned int txb_split_count;
+#endif
+
// These are set to their default values at the beginning, and then adjusted
// further in the encoding process.
BLOCK_SIZE min_partition_size;
BLOCK_SIZE max_partition_size;
- int mv_best_ref_index[MAX_REF_FRAMES];
- unsigned int max_mv_context[MAX_REF_FRAMES];
+ int mv_best_ref_index[TOTAL_REFS_PER_FRAME];
+ unsigned int max_mv_context[TOTAL_REFS_PER_FRAME];
unsigned int source_variance;
- unsigned int pred_sse[MAX_REF_FRAMES];
- int pred_mv_sad[MAX_REF_FRAMES];
+ unsigned int recon_variance;
+ unsigned int pred_sse[TOTAL_REFS_PER_FRAME];
+ int pred_mv_sad[TOTAL_REFS_PER_FRAME];
#if CONFIG_REF_MV
int *nmvjointcost;
@@ -127,8 +138,8 @@
int *nmvcost_hp[2];
int nmvjointsadcost[MV_JOINTS];
#endif
- int **mvcost;
+ int **mvcost;
int *nmvsadcost[2];
int *nmvsadcost_hp[2];
int **mvsadcost;
@@ -148,6 +159,13 @@
int mv_row_min;
int mv_row_max;
+#if CONFIG_VAR_TX
+ uint8_t blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
+#if CONFIG_REF_MV
+ uint8_t blk_skip_drl[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
+#endif
+#endif
+
int skip;
// note that token_costs is the cost when eob node is skipped
@@ -158,11 +176,18 @@
// indicate if it is in the rd search loop or encoding process
int use_lp32x32fdct;
- // use fast quantization process
- int quant_fp;
-
// Used to store sub partition's choices.
- MV pred_mv[MAX_REF_FRAMES];
+ MV pred_mv[TOTAL_REFS_PER_FRAME];
+
+ // Store the best motion vector during motion search
+ int_mv best_mv;
+ // Store the second best motion vector during full-pixel motion search
+ int_mv second_best_mv;
+
+ // use default transform and skip transform type search for intra modes
+ int use_default_intra_tx_type;
+ // use default transform and skip transform type search for inter modes
+ int use_default_inter_tx_type;
#if CONFIG_PVQ
int rate;
// 1 if neither AC nor DC is coded. Only used during RDO.
diff --git a/av1/encoder/context_tree.c b/av1/encoder/context_tree.c
index b7b5cbe..e1db4be 100644
--- a/av1/encoder/context_tree.c
+++ b/av1/encoder/context_tree.c
@@ -12,30 +12,42 @@
#include "av1/encoder/context_tree.h"
#include "av1/encoder/encoder.h"
-static const BLOCK_SIZE square[] = {
- BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64,
+static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 2] = {
+ BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64,
+#if CONFIG_EXT_PARTITION
+ BLOCK_128X128,
+#endif // CONFIG_EXT_PARTITION
};
static void alloc_mode_context(AV1_COMMON *cm, int num_4x4_blk,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition,
+#endif
PICK_MODE_CONTEXT *ctx) {
const int num_blk = (num_4x4_blk < 4 ? 4 : num_4x4_blk);
const int num_pix = num_blk << 4;
int i;
ctx->num_4x4_blk = num_blk;
+#if CONFIG_EXT_PARTITION_TYPES
+ ctx->partition = partition;
+#endif
for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VAR_TX
+ CHECK_MEM_ERROR(cm, ctx->blk_skip[i], aom_calloc(num_blk, sizeof(uint8_t)));
+#endif
CHECK_MEM_ERROR(cm, ctx->coeff[i],
aom_memalign(32, num_pix * sizeof(*ctx->coeff[i])));
CHECK_MEM_ERROR(cm, ctx->qcoeff[i],
aom_memalign(32, num_pix * sizeof(*ctx->qcoeff[i])));
CHECK_MEM_ERROR(cm, ctx->dqcoeff[i],
aom_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i])));
+ CHECK_MEM_ERROR(cm, ctx->eobs[i],
+ aom_memalign(32, num_blk * sizeof(*ctx->eobs[i])));
#if CONFIG_PVQ
CHECK_MEM_ERROR(cm, ctx->pvq_ref_coeff[i],
aom_memalign(32, num_pix * sizeof(*ctx->pvq_ref_coeff[i])));
#endif
- CHECK_MEM_ERROR(cm, ctx->eobs[i],
- aom_memalign(32, num_blk * sizeof(*ctx->eobs[i])));
}
#if CONFIG_PALETTE
@@ -52,6 +64,10 @@
static void free_mode_context(PICK_MODE_CONTEXT *ctx) {
int i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VAR_TX
+ aom_free(ctx->blk_skip[i]);
+ ctx->blk_skip[i] = 0;
+#endif
aom_free(ctx->coeff[i]);
ctx->coeff[i] = 0;
aom_free(ctx->qcoeff[i]);
@@ -65,19 +81,71 @@
aom_free(ctx->eobs[i]);
ctx->eobs[i] = 0;
}
+
#if CONFIG_PALETTE
for (i = 0; i < 2; ++i) {
aom_free(ctx->color_index_map[i]);
- ctx->color_index_map[i] = NULL;
+ ctx->color_index_map[i] = 0;
}
#endif // CONFIG_PALETTE
}
static void alloc_tree_contexts(AV1_COMMON *cm, PC_TREE *tree,
int num_4x4_blk) {
+#if CONFIG_EXT_PARTITION_TYPES
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_NONE, &tree->none);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_HORZ, &tree->horizontal[0]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT, &tree->vertical[0]);
+  alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_HORZ, &tree->horizontal[1]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT, &tree->vertical[1]);
+
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_A,
+ &tree->horizontala[0]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_A,
+ &tree->horizontala[1]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_HORZ_A,
+ &tree->horizontala[2]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_HORZ_B,
+ &tree->horizontalb[0]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_B,
+ &tree->horizontalb[1]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_HORZ_B,
+ &tree->horizontalb[2]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_A,
+ &tree->verticala[0]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_A,
+ &tree->verticala[1]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT_A,
+ &tree->verticala[2]);
+ alloc_mode_context(cm, num_4x4_blk / 2, PARTITION_VERT_B,
+ &tree->verticalb[0]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_B,
+ &tree->verticalb[1]);
+ alloc_mode_context(cm, num_4x4_blk / 4, PARTITION_VERT_B,
+ &tree->verticalb[2]);
+#if CONFIG_SUPERTX
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ,
+ &tree->horizontal_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT, &tree->vertical_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_SPLIT, &tree->split_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ_A,
+ &tree->horizontala_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ_B,
+ &tree->horizontalb_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT_A,
+ &tree->verticala_supertx);
+ alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT_B,
+ &tree->verticalb_supertx);
+#endif // CONFIG_SUPERTX
+#else
alloc_mode_context(cm, num_4x4_blk, &tree->none);
alloc_mode_context(cm, num_4x4_blk / 2, &tree->horizontal[0]);
alloc_mode_context(cm, num_4x4_blk / 2, &tree->vertical[0]);
+#if CONFIG_SUPERTX
+ alloc_mode_context(cm, num_4x4_blk, &tree->horizontal_supertx);
+ alloc_mode_context(cm, num_4x4_blk, &tree->vertical_supertx);
+ alloc_mode_context(cm, num_4x4_blk, &tree->split_supertx);
+#endif
if (num_4x4_blk > 4) {
alloc_mode_context(cm, num_4x4_blk / 2, &tree->horizontal[1]);
@@ -86,14 +154,35 @@
memset(&tree->horizontal[1], 0, sizeof(tree->horizontal[1]));
memset(&tree->vertical[1], 0, sizeof(tree->vertical[1]));
}
+#endif // CONFIG_EXT_PARTITION_TYPES
}
static void free_tree_contexts(PC_TREE *tree) {
+#if CONFIG_EXT_PARTITION_TYPES
+ int i;
+ for (i = 0; i < 3; i++) {
+ free_mode_context(&tree->horizontala[i]);
+ free_mode_context(&tree->horizontalb[i]);
+ free_mode_context(&tree->verticala[i]);
+ free_mode_context(&tree->verticalb[i]);
+ }
+#endif // CONFIG_EXT_PARTITION_TYPES
free_mode_context(&tree->none);
free_mode_context(&tree->horizontal[0]);
free_mode_context(&tree->horizontal[1]);
free_mode_context(&tree->vertical[0]);
free_mode_context(&tree->vertical[1]);
+#if CONFIG_SUPERTX
+ free_mode_context(&tree->horizontal_supertx);
+ free_mode_context(&tree->vertical_supertx);
+ free_mode_context(&tree->split_supertx);
+#if CONFIG_EXT_PARTITION_TYPES
+ free_mode_context(&tree->horizontala_supertx);
+ free_mode_context(&tree->horizontalb_supertx);
+ free_mode_context(&tree->verticala_supertx);
+ free_mode_context(&tree->verticalb_supertx);
+#endif // CONFIG_EXT_PARTITION_TYPES
+#endif // CONFIG_SUPERTX
}
// This function sets up a tree of contexts such that at each square
@@ -102,8 +191,13 @@
// represents the state of our search.
void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
int i, j;
+#if CONFIG_EXT_PARTITION
+ const int leaf_nodes = 256;
+ const int tree_nodes = 256 + 64 + 16 + 4 + 1;
+#else
const int leaf_nodes = 64;
const int tree_nodes = 64 + 16 + 4 + 1;
+#endif // CONFIG_EXT_PARTITION
int pc_tree_index = 0;
PC_TREE *this_pc;
PICK_MODE_CONTEXT *this_leaf;
@@ -122,7 +216,13 @@
// 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same
// context so we only need to allocate 1 for each 8x8 block.
- for (i = 0; i < leaf_nodes; ++i) alloc_mode_context(cm, 1, &td->leaf_tree[i]);
+ for (i = 0; i < leaf_nodes; ++i) {
+#if CONFIG_EXT_PARTITION_TYPES
+ alloc_mode_context(cm, 1, PARTITION_NONE, &td->leaf_tree[i]);
+#else
+ alloc_mode_context(cm, 1, &td->leaf_tree[i]);
+#endif
+ }
// Sets up all the leaf nodes in the tree.
for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
@@ -135,7 +235,7 @@
// Each node has 4 leaf nodes, fill each block_size level of the tree
// from leafs to the root.
- for (nodes = 16; nodes > 0; nodes >>= 2) {
+ for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
for (i = 0; i < nodes; ++i) {
PC_TREE *const tree = &td->pc_tree[pc_tree_index];
alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
@@ -145,16 +245,30 @@
}
++square_index;
}
- td->pc_root = &td->pc_tree[tree_nodes - 1];
- td->pc_root[0].none.best_mode_index = 2;
+
+ // Set up the root node for the largest superblock size
+ i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
+ td->pc_root[i] = &td->pc_tree[tree_nodes - 1];
+ td->pc_root[i]->none.best_mode_index = 2;
+ // Set up the root nodes for the rest of the possible superblock sizes
+ while (--i >= 0) {
+ td->pc_root[i] = td->pc_root[i + 1]->split[0];
+ td->pc_root[i]->none.best_mode_index = 2;
+ }
}
void av1_free_pc_tree(ThreadData *td) {
+#if CONFIG_EXT_PARTITION
+ const int leaf_nodes = 256;
+ const int tree_nodes = 256 + 64 + 16 + 4 + 1;
+#else
+ const int leaf_nodes = 64;
const int tree_nodes = 64 + 16 + 4 + 1;
+#endif // CONFIG_EXT_PARTITION
int i;
// Set up all 4x4 mode contexts
- for (i = 0; i < 64; ++i) free_mode_context(&td->leaf_tree[i]);
+ for (i = 0; i < leaf_nodes; ++i) free_mode_context(&td->leaf_tree[i]);
// Sets up all the leaf nodes in the tree.
for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]);
diff --git a/av1/encoder/context_tree.h b/av1/encoder/context_tree.h
index 4f1c647..7496d11 100644
--- a/av1/encoder/context_tree.h
+++ b/av1/encoder/context_tree.h
@@ -30,6 +30,11 @@
#if CONFIG_PALETTE
uint8_t *color_index_map[2];
#endif // CONFIG_PALETTE
+#if CONFIG_VAR_TX
+ uint8_t *blk_skip[MAX_MB_PLANE];
+#endif
+
+ // dual buffer pointers, 0: in use, 1: best in store
tran_low_t *coeff[MAX_MB_PLANE];
tran_low_t *qcoeff[MAX_MB_PLANE];
tran_low_t *dqcoeff[MAX_MB_PLANE];
@@ -56,8 +61,11 @@
// motion vector cache for adaptive motion search control in partition
// search loop
- MV pred_mv[MAX_REF_FRAMES];
+ MV pred_mv[TOTAL_REFS_PER_FRAME];
InterpFilter pred_interp_filter;
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition;
+#endif
} PICK_MODE_CONTEXT;
typedef struct PC_TREE {
@@ -67,10 +75,27 @@
PICK_MODE_CONTEXT none;
PICK_MODE_CONTEXT horizontal[2];
PICK_MODE_CONTEXT vertical[2];
+#if CONFIG_EXT_PARTITION_TYPES
+ PICK_MODE_CONTEXT horizontala[3];
+ PICK_MODE_CONTEXT horizontalb[3];
+ PICK_MODE_CONTEXT verticala[3];
+ PICK_MODE_CONTEXT verticalb[3];
+#endif
union {
struct PC_TREE *split[4];
PICK_MODE_CONTEXT *leaf_split[4];
};
+#if CONFIG_SUPERTX
+ PICK_MODE_CONTEXT horizontal_supertx;
+ PICK_MODE_CONTEXT vertical_supertx;
+ PICK_MODE_CONTEXT split_supertx;
+#if CONFIG_EXT_PARTITION_TYPES
+ PICK_MODE_CONTEXT horizontala_supertx;
+ PICK_MODE_CONTEXT horizontalb_supertx;
+ PICK_MODE_CONTEXT verticala_supertx;
+ PICK_MODE_CONTEXT verticalb_supertx;
+#endif
+#endif
} PC_TREE;
void av1_setup_pc_tree(struct AV1Common *cm, struct ThreadData *td);
diff --git a/av1/encoder/corner_detect.c b/av1/encoder/corner_detect.c
new file mode 100644
index 0000000..44747d3
--- /dev/null
+++ b/av1/encoder/corner_detect.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "third_party/fastfeat/fast.h"
+
+#include "av1/encoder/corner_detect.h"
+
+// Wrapper around the third-party FAST-9 corner detector.
+#define FAST_BARRIER 40
+int fast_corner_detect(unsigned char *buf, int width, int height, int stride,
+ int *points, int max_points) {
+ int num_points;
+ xy *const frm_corners_xy = fast9_detect_nonmax(buf, width, height, stride,
+ FAST_BARRIER, &num_points);
+ num_points = (num_points <= max_points ? num_points : max_points);
+ if (num_points > 0 && frm_corners_xy) {
+ memcpy(points, frm_corners_xy, sizeof(*frm_corners_xy) * num_points);
+ free(frm_corners_xy);
+ return num_points;
+ }
+ free(frm_corners_xy);
+ return 0;
+}
diff --git a/av1/encoder/corner_detect.h b/av1/encoder/corner_detect.h
new file mode 100644
index 0000000..257f1db
--- /dev/null
+++ b/av1/encoder/corner_detect.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AV1_ENCODER_CORNER_DETECT_H_
+#define AV1_ENCODER_CORNER_DETECT_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+int fast_corner_detect(unsigned char *buf, int width, int height, int stride,
+ int *points, int max_points);
+
+#endif // AV1_ENCODER_CORNER_DETECT_H_
diff --git a/av1/encoder/corner_match.c b/av1/encoder/corner_match.c
new file mode 100644
index 0000000..2cb282b
--- /dev/null
+++ b/av1/encoder/corner_match.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+
+#include "av1/encoder/corner_match.h"
+
+#define MATCH_SZ 15
+#define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2)
+#define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ)
+#define SEARCH_SZ 9
+#define SEARCH_SZ_BY2 ((SEARCH_SZ - 1) / 2)
+
+#define THRESHOLD_NCC 0.80
+
+static double compute_variance(unsigned char *im, int stride, int x, int y,
+ double *mean) {
+ double sum = 0.0;
+ double sumsq = 0.0;
+ double var;
+ int i, j;
+ for (i = 0; i < MATCH_SZ; ++i)
+ for (j = 0; j < MATCH_SZ; ++j) {
+ sum += im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)];
+ sumsq += im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)] *
+ im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)];
+ }
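+  // Population variance of the MATCH_SZ x MATCH_SZ patch:
+  // E[x^2] - (E[x])^2, computed from the running sums above.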
+ var = (sumsq * MATCH_SZ_SQ - sum * sum) / (MATCH_SZ_SQ * MATCH_SZ_SQ);
+ if (mean) *mean = sum / MATCH_SZ_SQ;
+ return var;
+}
+
+static double compute_cross_correlation(unsigned char *im1, int stride1, int x1,
+ int y1, unsigned char *im2, int stride2,
+ int x2, int y2) {
+ double sum1 = 0;
+ double sum2 = 0;
+ double cross = 0;
+ double corr;
+ int i, j;
+ for (i = 0; i < MATCH_SZ; ++i)
+ for (j = 0; j < MATCH_SZ; ++j) {
+ sum1 += im1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)];
+ sum2 += im2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)];
+ cross +=
+ im1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)] *
+ im2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)];
+ }
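+  // Patch covariance E[xy] - E[x]E[y]; callers divide by the square root of
+  // the two patch variances to obtain the normalized cross-correlation (NCC).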
+ corr = (cross * MATCH_SZ_SQ - sum1 * sum2) / (MATCH_SZ_SQ * MATCH_SZ_SQ);
+ return corr;
+}
+
+static int is_eligible_point(double pointx, double pointy, int width,
+ int height) {
+ return (pointx >= MATCH_SZ_BY2 && pointy >= MATCH_SZ_BY2 &&
+ pointx + MATCH_SZ_BY2 < width && pointy + MATCH_SZ_BY2 < height);
+}
+
+static int is_eligible_distance(double point1x, double point1y, double point2x,
+ double point2y, int width, int height) {
+ const int thresh = (width < height ? height : width) >> 4;
+ return ((point1x - point2x) * (point1x - point2x) +
+ (point1y - point2y) * (point1y - point2y)) <= thresh * thresh;
+}
+
+static void improve_correspondence(unsigned char *frm, unsigned char *ref,
+ int width, int height, int frm_stride,
+ int ref_stride,
+ Correspondence *correspondences,
+ int num_correspondences) {
+ int i;
+ for (i = 0; i < num_correspondences; ++i) {
+ double template_norm =
+ compute_variance(frm, frm_stride, (int)correspondences[i].x,
+ (int)correspondences[i].y, NULL);
+ int x, y, best_x = 0, best_y = 0;
+ double best_match_ncc = 0.0;
+ for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) {
+ for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) {
+ double match_ncc;
+ double subimage_norm;
+ if (!is_eligible_point((int)correspondences[i].rx + x,
+ (int)correspondences[i].ry + y, width, height))
+ continue;
+ if (!is_eligible_distance(
+ (int)correspondences[i].x, (int)correspondences[i].y,
+ (int)correspondences[i].rx + x, (int)correspondences[i].ry + y,
+ width, height))
+ continue;
+ subimage_norm =
+ compute_variance(ref, ref_stride, (int)correspondences[i].rx + x,
+ (int)correspondences[i].ry + y, NULL);
+ match_ncc = compute_cross_correlation(
+ frm, frm_stride, (int)correspondences[i].x,
+ (int)correspondences[i].y, ref, ref_stride,
+ (int)correspondences[i].rx + x,
+ (int)correspondences[i].ry + y) /
+ sqrt(template_norm * subimage_norm);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_y = y;
+ best_x = x;
+ }
+ }
+ }
+ correspondences[i].rx += (double)best_x;
+ correspondences[i].ry += (double)best_y;
+ }
+ for (i = 0; i < num_correspondences; ++i) {
+ double template_norm =
+ compute_variance(ref, ref_stride, (int)correspondences[i].rx,
+ (int)correspondences[i].ry, NULL);
+ int x, y, best_x = 0, best_y = 0;
+ double best_match_ncc = 0.0;
+ for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y)
+ for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) {
+ double match_ncc;
+ double subimage_norm;
+ if (!is_eligible_point((int)correspondences[i].x + x,
+ (int)correspondences[i].y + y, width, height))
+ continue;
+ if (!is_eligible_distance((int)correspondences[i].x + x,
+ (int)correspondences[i].y + y,
+ (int)correspondences[i].rx,
+ (int)correspondences[i].ry, width, height))
+ continue;
+ subimage_norm =
+ compute_variance(frm, frm_stride, (int)correspondences[i].x + x,
+ (int)correspondences[i].y + y, NULL);
+ match_ncc =
+ compute_cross_correlation(
+ frm, frm_stride, (int)correspondences[i].x + x,
+ (int)correspondences[i].y + y, ref, ref_stride,
+ (int)correspondences[i].rx, (int)correspondences[i].ry) /
+ sqrt(template_norm * subimage_norm);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_y = y;
+ best_x = x;
+ }
+ }
+ correspondences[i].x += best_x;
+ correspondences[i].y += best_y;
+ }
+}
+
+int determine_correspondence(unsigned char *frm, int *frm_corners,
+ int num_frm_corners, unsigned char *ref,
+ int *ref_corners, int num_ref_corners, int width,
+ int height, int frm_stride, int ref_stride,
+ double *correspondence_pts) {
+ // TODO(sarahparker) Improve this to include 2-way match
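+  // For each frame corner, greedily pick the reference corner with the
+  // highest NCC score and keep the pair only if it exceeds THRESHOLD_NCC.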
+ int i, j;
+ Correspondence *correspondences = (Correspondence *)correspondence_pts;
+ int num_correspondences = 0;
+ for (i = 0; i < num_frm_corners; ++i) {
+ double best_match_ncc = 0.0;
+ double template_norm;
+ int best_match_j = -1;
+ if (!is_eligible_point(frm_corners[2 * i], frm_corners[2 * i + 1], width,
+ height))
+ continue;
+ template_norm = compute_variance(frm, frm_stride, frm_corners[2 * i],
+ frm_corners[2 * i + 1], NULL);
+ for (j = 0; j < num_ref_corners; ++j) {
+ double match_ncc;
+ double subimage_norm;
+ if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width,
+ height))
+ continue;
+ if (!is_eligible_distance(frm_corners[2 * i], frm_corners[2 * i + 1],
+ ref_corners[2 * j], ref_corners[2 * j + 1],
+ width, height))
+ continue;
+ subimage_norm = compute_variance(ref, ref_stride, ref_corners[2 * j],
+ ref_corners[2 * j + 1], NULL);
+ match_ncc = compute_cross_correlation(frm, frm_stride, frm_corners[2 * i],
+ frm_corners[2 * i + 1], ref,
+ ref_stride, ref_corners[2 * j],
+ ref_corners[2 * j + 1]) /
+ sqrt(template_norm * subimage_norm);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_match_j = j;
+ }
+ }
+ if (best_match_ncc > THRESHOLD_NCC) {
+ correspondences[num_correspondences].x = (double)frm_corners[2 * i];
+ correspondences[num_correspondences].y = (double)frm_corners[2 * i + 1];
+ correspondences[num_correspondences].rx =
+ (double)ref_corners[2 * best_match_j];
+ correspondences[num_correspondences].ry =
+ (double)ref_corners[2 * best_match_j + 1];
+ num_correspondences++;
+ }
+ }
+ improve_correspondence(frm, ref, width, height, frm_stride, ref_stride,
+ correspondences, num_correspondences);
+ return num_correspondences;
+}
diff --git a/av1/encoder/corner_match.h b/av1/encoder/corner_match.h
new file mode 100644
index 0000000..521c756
--- /dev/null
+++ b/av1/encoder/corner_match.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AV1_ENCODER_CORNER_MATCH_H_
+#define AV1_ENCODER_CORNER_MATCH_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+typedef struct {
+ double x, y;
+ double rx, ry;
+} Correspondence;
+
+int determine_correspondence(unsigned char *frm, int *frm_corners,
+ int num_frm_corners, unsigned char *ref,
+ int *ref_corners, int num_ref_corners, int width,
+ int height, int frm_stride, int ref_stride,
+ double *correspondence_pts);
+
+#endif // AV1_ENCODER_CORNER_MATCH_H_
diff --git a/av1/encoder/cost.c b/av1/encoder/cost.c
index 9ba30f0..e3151a5 100644
--- a/av1/encoder/cost.c
+++ b/av1/encoder/cost.c
@@ -11,11 +11,11 @@
#include <assert.h>
#include "av1/encoder/cost.h"
+#include "av1/common/entropy.h"
/* round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT))
- Begins and ends with a bogus entry to satisfy use of prob=0 in the firstpass.
- https://code.google.com/p/webm/issues/detail?id=1089 */
-const uint16_t av1_prob_cost[257] = {
+ Begins with a bogus entry for simpler addressing. */
+const uint16_t av1_prob_cost[256] = {
4096, 4096, 3584, 3284, 3072, 2907, 2772, 2659, 2560, 2473, 2395, 2325, 2260,
2201, 2147, 2096, 2048, 2003, 1961, 1921, 1883, 1847, 1813, 1780, 1748, 1718,
1689, 1661, 1635, 1609, 1584, 1559, 1536, 1513, 1491, 1470, 1449, 1429, 1409,
@@ -35,7 +35,7 @@
153, 150, 146, 143, 139, 136, 132, 129, 125, 122, 119, 115, 112,
109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73, 70,
66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29,
- 26, 23, 20, 18, 15, 12, 9, 6, 3, 3
+ 26, 23, 20, 18, 15, 12, 9, 6, 3
};
static void cost(int *costs, aom_tree tree, const aom_prob *probs, int i,
@@ -43,6 +43,7 @@
const aom_prob prob = probs[i / 2];
int b;
+ assert(prob != 0);
for (b = 0; b <= 1; ++b) {
const int cc = c + av1_cost_bit(prob, b);
const aom_tree_index ii = tree[i + b];
diff --git a/av1/encoder/cost.h b/av1/encoder/cost.h
index c130ba1..379200e 100644
--- a/av1/encoder/cost.h
+++ b/av1/encoder/cost.h
@@ -1,12 +1,11 @@
/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
*/
#ifndef AV1_ENCODER_COST_H_
@@ -19,7 +18,7 @@
extern "C" {
#endif
-extern const uint16_t av1_prob_cost[257];
+extern const uint16_t av1_prob_cost[256];
// The factor to scale from cost in bits to cost in av1_prob_cost units.
#define AV1_PROB_COST_SHIFT 9
@@ -30,6 +29,10 @@
#define av1_cost_bit(prob, bit) av1_cost_zero((bit) ? 256 - (prob) : (prob))
+// Cost of coding an n bit literal, using 128 (i.e. 50%) probability
+// for each bit.
+#define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT))
+
static INLINE unsigned int cost_branch256(const unsigned int ct[2],
aom_prob p) {
return ct[0] * av1_cost_zero(p) + ct[1] * av1_cost_one(p);
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 3bfc352..c002dab 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -12,14 +12,15 @@
#include <assert.h>
#include <math.h>
-#include "./av1_rtcd.h"
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
-
-#include "av1/common/blockd.h"
-#include "av1/common/idct.h"
+#include "./av1_rtcd.h"
#include "aom_dsp/fwd_txfm.h"
#include "aom_ports/mem.h"
+#include "av1/common/blockd.h"
+#include "av1/common/av1_fwd_txfm1d.h"
+#include "av1/common/av1_fwd_txfm2d_cfg.h"
+#include "av1/common/idct.h"
static INLINE void range_check(const tran_low_t *input, const int size,
const int bit) {
@@ -38,20 +39,6 @@
#endif
}
-#if CONFIG_CB4X4
-static void fdct2(const tran_low_t *input, tran_low_t *output) {
- tran_high_t s0, s1;
- const tran_low_t x0 = input[0];
- const tran_low_t x1 = input[1];
-
- s0 = (tran_high_t)x0 + x1;
- s1 = (tran_high_t)x0 - x1;
-
- output[0] = (tran_low_t)s0;
- output[1] = (tran_low_t)s1;
-}
-#endif
-
static void fdct4(const tran_low_t *input, tran_low_t *output) {
tran_high_t temp;
tran_low_t step[4];
@@ -166,149 +153,180 @@
range_check(output, 8, 16);
}
-static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
- tran_high_t step1[8];
- tran_high_t step2[8];
- tran_high_t step3[8];
- tran_high_t input[8];
- tran_high_t temp1, temp2;
+static void fdct16(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t temp;
+ tran_low_t step[16];
- // step 1
- input[0] = in[0] + in[15];
- input[1] = in[1] + in[14];
- input[2] = in[2] + in[13];
- input[3] = in[3] + in[12];
- input[4] = in[4] + in[11];
- input[5] = in[5] + in[10];
- input[6] = in[6] + in[9];
- input[7] = in[7] + in[8];
+ // stage 0
+ range_check(input, 16, 13);
- step1[0] = in[7] - in[8];
- step1[1] = in[6] - in[9];
- step1[2] = in[5] - in[10];
- step1[3] = in[4] - in[11];
- step1[4] = in[3] - in[12];
- step1[5] = in[2] - in[13];
- step1[6] = in[1] - in[14];
- step1[7] = in[0] - in[15];
+ // stage 1
+ output[0] = input[0] + input[15];
+ output[1] = input[1] + input[14];
+ output[2] = input[2] + input[13];
+ output[3] = input[3] + input[12];
+ output[4] = input[4] + input[11];
+ output[5] = input[5] + input[10];
+ output[6] = input[6] + input[9];
+ output[7] = input[7] + input[8];
+ output[8] = input[7] - input[8];
+ output[9] = input[6] - input[9];
+ output[10] = input[5] - input[10];
+ output[11] = input[4] - input[11];
+ output[12] = input[3] - input[12];
+ output[13] = input[2] - input[13];
+ output[14] = input[1] - input[14];
+ output[15] = input[0] - input[15];
- // fdct8(step, step);
- {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
- tran_high_t t0, t1, t2, t3;
- tran_high_t x0, x1, x2, x3;
+ range_check(output, 16, 14);
- // stage 1
- s0 = input[0] + input[7];
- s1 = input[1] + input[6];
- s2 = input[2] + input[5];
- s3 = input[3] + input[4];
- s4 = input[3] - input[4];
- s5 = input[2] - input[5];
- s6 = input[1] - input[6];
- s7 = input[0] - input[7];
+ // stage 2
+ step[0] = output[0] + output[7];
+ step[1] = output[1] + output[6];
+ step[2] = output[2] + output[5];
+ step[3] = output[3] + output[4];
+ step[4] = output[3] - output[4];
+ step[5] = output[2] - output[5];
+ step[6] = output[1] - output[6];
+ step[7] = output[0] - output[7];
+ step[8] = output[8];
+ step[9] = output[9];
+ temp = output[10] * -cospi_16_64 + output[13] * cospi_16_64;
+ step[10] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[11] * -cospi_16_64 + output[12] * cospi_16_64;
+ step[11] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[12] * cospi_16_64 + output[11] * cospi_16_64;
+ step[12] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[13] * cospi_16_64 + output[10] * cospi_16_64;
+ step[13] = (tran_low_t)fdct_round_shift(temp);
+ step[14] = output[14];
+ step[15] = output[15];
- // fdct4(step, step);
- x0 = s0 + s3;
- x1 = s1 + s2;
- x2 = s1 - s2;
- x3 = s0 - s3;
- t0 = (x0 + x1) * cospi_16_64;
- t1 = (x0 - x1) * cospi_16_64;
- t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
- t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
- out[0] = (tran_low_t)fdct_round_shift(t0);
- out[4] = (tran_low_t)fdct_round_shift(t2);
- out[8] = (tran_low_t)fdct_round_shift(t1);
- out[12] = (tran_low_t)fdct_round_shift(t3);
+ range_check(step, 16, 15);
- // Stage 2
- t0 = (s6 - s5) * cospi_16_64;
- t1 = (s6 + s5) * cospi_16_64;
- t2 = fdct_round_shift(t0);
- t3 = fdct_round_shift(t1);
+ // stage 3
+ output[0] = step[0] + step[3];
+ output[1] = step[1] + step[2];
+ output[2] = step[1] - step[2];
+ output[3] = step[0] - step[3];
+ output[4] = step[4];
+ temp = step[5] * -cospi_16_64 + step[6] * cospi_16_64;
+ output[5] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[6] * cospi_16_64 + step[5] * cospi_16_64;
+ output[6] = (tran_low_t)fdct_round_shift(temp);
+ output[7] = step[7];
+ output[8] = step[8] + step[11];
+ output[9] = step[9] + step[10];
+ output[10] = step[9] - step[10];
+ output[11] = step[8] - step[11];
+ output[12] = step[15] - step[12];
+ output[13] = step[14] - step[13];
+ output[14] = step[14] + step[13];
+ output[15] = step[15] + step[12];
- // Stage 3
- x0 = s4 + t2;
- x1 = s4 - t2;
- x2 = s7 - t3;
- x3 = s7 + t3;
+ range_check(output, 16, 16);
- // Stage 4
- t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
- t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
- t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
- t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- out[2] = (tran_low_t)fdct_round_shift(t0);
- out[6] = (tran_low_t)fdct_round_shift(t2);
- out[10] = (tran_low_t)fdct_round_shift(t1);
- out[14] = (tran_low_t)fdct_round_shift(t3);
- }
+ // stage 4
+ temp = output[0] * cospi_16_64 + output[1] * cospi_16_64;
+ step[0] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[1] * -cospi_16_64 + output[0] * cospi_16_64;
+ step[1] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[2] * cospi_24_64 + output[3] * cospi_8_64;
+ step[2] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[3] * cospi_24_64 + output[2] * -cospi_8_64;
+ step[3] = (tran_low_t)fdct_round_shift(temp);
+ step[4] = output[4] + output[5];
+ step[5] = output[4] - output[5];
+ step[6] = output[7] - output[6];
+ step[7] = output[7] + output[6];
+ step[8] = output[8];
+ temp = output[9] * -cospi_8_64 + output[14] * cospi_24_64;
+ step[9] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[10] * -cospi_24_64 + output[13] * -cospi_8_64;
+ step[10] = (tran_low_t)fdct_round_shift(temp);
+ step[11] = output[11];
+ step[12] = output[12];
+ temp = output[13] * cospi_24_64 + output[10] * -cospi_8_64;
+ step[13] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[14] * cospi_8_64 + output[9] * cospi_24_64;
+ step[14] = (tran_low_t)fdct_round_shift(temp);
+ step[15] = output[15];
- // step 2
- temp1 = (step1[5] - step1[2]) * cospi_16_64;
- temp2 = (step1[4] - step1[3]) * cospi_16_64;
- step2[2] = fdct_round_shift(temp1);
- step2[3] = fdct_round_shift(temp2);
- temp1 = (step1[4] + step1[3]) * cospi_16_64;
- temp2 = (step1[5] + step1[2]) * cospi_16_64;
- step2[4] = fdct_round_shift(temp1);
- step2[5] = fdct_round_shift(temp2);
+ range_check(step, 16, 16);
- // step 3
- step3[0] = step1[0] + step2[3];
- step3[1] = step1[1] + step2[2];
- step3[2] = step1[1] - step2[2];
- step3[3] = step1[0] - step2[3];
- step3[4] = step1[7] - step2[4];
- step3[5] = step1[6] - step2[5];
- step3[6] = step1[6] + step2[5];
- step3[7] = step1[7] + step2[4];
+ // stage 5
+ output[0] = step[0];
+ output[1] = step[1];
+ output[2] = step[2];
+ output[3] = step[3];
+ temp = step[4] * cospi_28_64 + step[7] * cospi_4_64;
+ output[4] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[5] * cospi_12_64 + step[6] * cospi_20_64;
+ output[5] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[6] * cospi_12_64 + step[5] * -cospi_20_64;
+ output[6] = (tran_low_t)fdct_round_shift(temp);
+ temp = step[7] * cospi_28_64 + step[4] * -cospi_4_64;
+ output[7] = (tran_low_t)fdct_round_shift(temp);
+ output[8] = step[8] + step[9];
+ output[9] = step[8] - step[9];
+ output[10] = step[11] - step[10];
+ output[11] = step[11] + step[10];
+ output[12] = step[12] + step[13];
+ output[13] = step[12] - step[13];
+ output[14] = step[15] - step[14];
+ output[15] = step[15] + step[14];
- // step 4
- temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
- temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
- step2[1] = fdct_round_shift(temp1);
- step2[2] = fdct_round_shift(temp2);
- temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
- temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
- step2[5] = fdct_round_shift(temp1);
- step2[6] = fdct_round_shift(temp2);
+ range_check(output, 16, 16);
- // step 5
- step1[0] = step3[0] + step2[1];
- step1[1] = step3[0] - step2[1];
- step1[2] = step3[3] + step2[2];
- step1[3] = step3[3] - step2[2];
- step1[4] = step3[4] - step2[5];
- step1[5] = step3[4] + step2[5];
- step1[6] = step3[7] - step2[6];
- step1[7] = step3[7] + step2[6];
+ // stage 6
+ step[0] = output[0];
+ step[1] = output[1];
+ step[2] = output[2];
+ step[3] = output[3];
+ step[4] = output[4];
+ step[5] = output[5];
+ step[6] = output[6];
+ step[7] = output[7];
+ temp = output[8] * cospi_30_64 + output[15] * cospi_2_64;
+ step[8] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[9] * cospi_14_64 + output[14] * cospi_18_64;
+ step[9] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[10] * cospi_22_64 + output[13] * cospi_10_64;
+ step[10] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[11] * cospi_6_64 + output[12] * cospi_26_64;
+ step[11] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[12] * cospi_6_64 + output[11] * -cospi_26_64;
+ step[12] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[13] * cospi_22_64 + output[10] * -cospi_10_64;
+ step[13] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[14] * cospi_14_64 + output[9] * -cospi_18_64;
+ step[14] = (tran_low_t)fdct_round_shift(temp);
+ temp = output[15] * cospi_30_64 + output[8] * -cospi_2_64;
+ step[15] = (tran_low_t)fdct_round_shift(temp);
- // step 6
- temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
- temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
- out[1] = (tran_low_t)fdct_round_shift(temp1);
- out[9] = (tran_low_t)fdct_round_shift(temp2);
+ range_check(step, 16, 16);
- temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
- temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
- out[5] = (tran_low_t)fdct_round_shift(temp1);
- out[13] = (tran_low_t)fdct_round_shift(temp2);
+ // stage 7
+ output[0] = step[0];
+ output[1] = step[8];
+ output[2] = step[4];
+ output[3] = step[12];
+ output[4] = step[2];
+ output[5] = step[10];
+ output[6] = step[6];
+ output[7] = step[14];
+ output[8] = step[1];
+ output[9] = step[9];
+ output[10] = step[5];
+ output[11] = step[13];
+ output[12] = step[3];
+ output[13] = step[11];
+ output[14] = step[7];
+ output[15] = step[15];
- temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
- temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
- out[3] = (tran_low_t)fdct_round_shift(temp1);
- out[11] = (tran_low_t)fdct_round_shift(temp2);
-
- temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
- temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
- out[7] = (tran_low_t)fdct_round_shift(temp1);
- out[15] = (tran_low_t)fdct_round_shift(temp2);
+ range_check(output, 16, 16);
}
-// TODO(angiebird): Unify this with av1_fwd_txfm.c: av1_fdct32
-#if CONFIG_EXT_TX
static void fdct32(const tran_low_t *input, tran_low_t *output) {
tran_high_t temp;
tran_low_t step[32];
@@ -706,23 +724,6 @@
range_check(output, 32, 18);
}
-#endif // CONFIG_EXT_TX
-
-#if CONFIG_CB4X4
-static void fadst2(const tran_low_t *input, tran_low_t *output) {
- tran_high_t s0, s1;
- tran_low_t x0, x1;
-
- x0 = input[0];
- x1 = input[1];
-
- s0 = sinpi_1_5 * x0 + sinpi_2_5 * x1;
- s1 = sinpi_2_5 * x0 - sinpi_1_5 * x1;
-
- output[0] = (tran_low_t)fdct_round_shift(s0);
- output[1] = (tran_low_t)fdct_round_shift(s1);
-}
-#endif
static void fadst4(const tran_low_t *input, tran_low_t *output) {
tran_high_t x0, x1, x2, x3;
@@ -998,6 +999,21 @@
output[15] = (tran_low_t)-x1;
}
+// For use in lieu of ADST
+static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ tran_low_t inputhalf[16];
+ for (i = 0; i < 16; ++i) {
+ output[16 + i] = input[i] * 4;
+ }
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 16; ++i) {
+ inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 16] * Sqrt2);
+ }
+ fdct16(inputhalf, output);
+ // Note overall scaling factor is 4 times orthogonal
+}
+
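
Read informally from the code above: the first 16 inputs pass straight through, scaled by 4, into output[16..31], while the last 16 inputs are scaled by sqrt(2) and run through fdct16 into output[0..15]; the transform tables further down substitute this routine wherever a 32-point ADST stage would otherwise be required. In equation form (a restatement of the code, not new behaviour):

    output[16 + i] = 4 * input[i],                          0 <= i < 16
    output[0..15]  = fdct16(round(sqrt(2) * input[16..31]))
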
#if CONFIG_EXT_TX
static void fidtx4(const tran_low_t *input, tran_low_t *output) {
int i;
@@ -1021,43 +1037,28 @@
for (i = 0; i < 32; ++i) output[i] = input[i] * 4;
}
-// For use in lieu of ADST
-static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
- int i;
- tran_low_t inputhalf[16];
- for (i = 0; i < 16; ++i) {
- output[16 + i] = input[i] * 4;
- }
- // Multiply input by sqrt(2)
- for (i = 0; i < 16; ++i) {
- inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 16] * Sqrt2);
- }
- fdct16(inputhalf, output);
- // Note overall scaling factor is 4 times orthogonal
-}
-
-static void copy_block(const int16_t *src, int src_stride, int l, int16_t *dest,
- int dest_stride) {
+static void copy_block(const int16_t *src, int src_stride, int l, int w,
+ int16_t *dest, int dest_stride) {
int i;
for (i = 0; i < l; ++i) {
- memcpy(dest + dest_stride * i, src + src_stride * i, l * sizeof(int16_t));
+ memcpy(dest + dest_stride * i, src + src_stride * i, w * sizeof(int16_t));
}
}
-static void fliplr(int16_t *dest, int stride, int l) {
+static void fliplr(int16_t *dest, int stride, int l, int w) {
int i, j;
for (i = 0; i < l; ++i) {
- for (j = 0; j < l / 2; ++j) {
+ for (j = 0; j < w / 2; ++j) {
const int16_t tmp = dest[i * stride + j];
- dest[i * stride + j] = dest[i * stride + l - 1 - j];
- dest[i * stride + l - 1 - j] = tmp;
+ dest[i * stride + j] = dest[i * stride + w - 1 - j];
+ dest[i * stride + w - 1 - j] = tmp;
}
}
}
-static void flipud(int16_t *dest, int stride, int l) {
+static void flipud(int16_t *dest, int stride, int l, int w) {
int i, j;
- for (j = 0; j < l; ++j) {
+ for (j = 0; j < w; ++j) {
for (i = 0; i < l / 2; ++i) {
const int16_t tmp = dest[i * stride + j];
dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
@@ -1066,36 +1067,36 @@
}
}
-static void fliplrud(int16_t *dest, int stride, int l) {
+static void fliplrud(int16_t *dest, int stride, int l, int w) {
int i, j;
for (i = 0; i < l / 2; ++i) {
- for (j = 0; j < l; ++j) {
+ for (j = 0; j < w; ++j) {
const int16_t tmp = dest[i * stride + j];
- dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
- dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
+ dest[i * stride + j] = dest[(l - 1 - i) * stride + w - 1 - j];
+ dest[(l - 1 - i) * stride + w - 1 - j] = tmp;
}
}
}
-static void copy_fliplr(const int16_t *src, int src_stride, int l,
+static void copy_fliplr(const int16_t *src, int src_stride, int l, int w,
int16_t *dest, int dest_stride) {
- copy_block(src, src_stride, l, dest, dest_stride);
- fliplr(dest, dest_stride, l);
+ copy_block(src, src_stride, l, w, dest, dest_stride);
+ fliplr(dest, dest_stride, l, w);
}
-static void copy_flipud(const int16_t *src, int src_stride, int l,
+static void copy_flipud(const int16_t *src, int src_stride, int l, int w,
int16_t *dest, int dest_stride) {
- copy_block(src, src_stride, l, dest, dest_stride);
- flipud(dest, dest_stride, l);
+ copy_block(src, src_stride, l, w, dest, dest_stride);
+ flipud(dest, dest_stride, l, w);
}
-static void copy_fliplrud(const int16_t *src, int src_stride, int l,
+static void copy_fliplrud(const int16_t *src, int src_stride, int l, int w,
int16_t *dest, int dest_stride) {
- copy_block(src, src_stride, l, dest, dest_stride);
- fliplrud(dest, dest_stride, l);
+ copy_block(src, src_stride, l, w, dest, dest_stride);
+ fliplrud(dest, dest_stride, l, w);
}
-static void maybe_flip_input(const int16_t **src, int *src_stride, int l,
+static void maybe_flip_input(const int16_t **src, int *src_stride, int l, int w,
int16_t *buff, int tx_type) {
switch (tx_type) {
case DCT_DCT:
@@ -1110,130 +1111,60 @@
case FLIPADST_DCT:
case FLIPADST_ADST:
case V_FLIPADST:
- copy_flipud(*src, *src_stride, l, buff, l);
+ copy_flipud(*src, *src_stride, l, w, buff, w);
*src = buff;
- *src_stride = l;
+ *src_stride = w;
break;
case DCT_FLIPADST:
case ADST_FLIPADST:
case H_FLIPADST:
- copy_fliplr(*src, *src_stride, l, buff, l);
+ copy_fliplr(*src, *src_stride, l, w, buff, w);
*src = buff;
- *src_stride = l;
+ *src_stride = w;
break;
case FLIPADST_FLIPADST:
- copy_fliplrud(*src, *src_stride, l, buff, l);
+ copy_fliplrud(*src, *src_stride, l, w, buff, w);
*src = buff;
- *src_stride = l;
+ *src_stride = w;
break;
default: assert(0); break;
}
}
#endif // CONFIG_EXT_TX
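
A small worked example of the new (l, w) = (rows, columns) convention, with arbitrary values. For the 2x4 block

    1 2 3 4
    5 6 7 8

fliplr(dest, stride, 2, 4) mirrors each row, giving 4 3 2 1 / 8 7 6 5; flipud(dest, stride, 2, 4) swaps the rows, giving 5 6 7 8 / 1 2 3 4; and fliplrud rotates the block by 180 degrees, giving 8 7 6 5 / 4 3 2 1. maybe_flip_input() picks among these according to whether the FLIPADST stage sits in the vertical transform, the horizontal transform, or both.
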
-#if CONFIG_CB4X4
-static const transform_2d FHT_2[] = {
- { fdct2, fdct2 }, { fadst2, fdct2 }, { fdct2, fadst2 }, { fadst2, fadst2 },
-};
-#endif
-
-static const transform_2d FHT_4[] = {
- { fdct4, fdct4 }, // DCT_DCT = 0
- { fadst4, fdct4 }, // ADST_DCT = 1
- { fdct4, fadst4 }, // DCT_ADST = 2
- { fadst4, fadst4 }, // ADST_ADST = 3
-#if CONFIG_EXT_TX
- { fadst4, fdct4 }, // FLIPADST_DCT = 4
- { fdct4, fadst4 }, // DCT_FLIPADST = 5
- { fadst4, fadst4 }, // FLIPADST_FLIPADST = 6
- { fadst4, fadst4 }, // ADST_FLIPADST = 7
- { fadst4, fadst4 }, // FLIPADST_ADST = 8
- { fidtx4, fidtx4 }, // IDTX = 9
- { fdct4, fidtx4 }, // V_DCT = 10
- { fidtx4, fdct4 }, // H_DCT = 11
- { fadst4, fidtx4 }, // V_ADST = 12
- { fidtx4, fadst4 }, // H_ADST = 13
- { fadst4, fidtx4 }, // V_FLIPADST = 14
- { fidtx4, fadst4 }, // H_FLIPADST = 15
-#endif // CONFIG_EXT_TX
-};
-
-static const transform_2d FHT_8[] = {
- { fdct8, fdct8 }, // DCT_DCT = 0
- { fadst8, fdct8 }, // ADST_DCT = 1
- { fdct8, fadst8 }, // DCT_ADST = 2
- { fadst8, fadst8 }, // ADST_ADST = 3
-#if CONFIG_EXT_TX
- { fadst8, fdct8 }, // FLIPADST_DCT = 4
- { fdct8, fadst8 }, // DCT_FLIPADST = 5
- { fadst8, fadst8 }, // FLIPADST_FLIPADST = 6
- { fadst8, fadst8 }, // ADST_FLIPADST = 7
- { fadst8, fadst8 }, // FLIPADST_ADST = 8
- { fidtx8, fidtx8 }, // IDTX = 9
- { fdct8, fidtx8 }, // V_DCT = 10
- { fidtx8, fdct8 }, // H_DCT = 11
- { fadst8, fidtx8 }, // V_ADST = 12
- { fidtx8, fadst8 }, // H_ADST = 13
- { fadst8, fidtx8 }, // V_FLIPADST = 14
- { fidtx8, fadst8 }, // H_FLIPADST = 15
-#endif // CONFIG_EXT_TX
-};
-
-static const transform_2d FHT_16[] = {
- { fdct16, fdct16 }, // DCT_DCT = 0
- { fadst16, fdct16 }, // ADST_DCT = 1
- { fdct16, fadst16 }, // DCT_ADST = 2
- { fadst16, fadst16 }, // ADST_ADST = 3
-#if CONFIG_EXT_TX
- { fadst16, fdct16 }, // FLIPADST_DCT = 4
- { fdct16, fadst16 }, // DCT_FLIPADST = 5
- { fadst16, fadst16 }, // FLIPADST_FLIPADST = 6
- { fadst16, fadst16 }, // ADST_FLIPADST = 7
- { fadst16, fadst16 }, // FLIPADST_ADST = 8
- { fidtx16, fidtx16 }, // IDTX = 9
- { fdct16, fidtx16 }, // V_DCT = 10
- { fidtx16, fdct16 }, // H_DCT = 11
- { fadst16, fidtx16 }, // V_ADST = 12
- { fidtx16, fadst16 }, // H_ADST = 13
- { fadst16, fidtx16 }, // V_FLIPADST = 14
- { fidtx16, fadst16 }, // H_FLIPADST = 15
-#endif // CONFIG_EXT_TX
-};
-
-#if CONFIG_EXT_TX
-static const transform_2d FHT_32[] = {
- { fdct32, fdct32 }, // DCT_DCT = 0
- { fhalfright32, fdct32 }, // ADST_DCT = 1
- { fdct32, fhalfright32 }, // DCT_ADST = 2
- { fhalfright32, fhalfright32 }, // ADST_ADST = 3
- { fhalfright32, fdct32 }, // FLIPADST_DCT = 4
- { fdct32, fhalfright32 }, // DCT_FLIPADST = 5
- { fhalfright32, fhalfright32 }, // FLIPADST_FLIPADST = 6
- { fhalfright32, fhalfright32 }, // ADST_FLIPADST = 7
- { fhalfright32, fhalfright32 }, // FLIPADST_ADST = 8
- { fidtx32, fidtx32 }, // IDTX = 9
- { fdct32, fidtx32 }, // V_DCT = 10
- { fidtx32, fdct32 }, // H_DCT = 11
- { fhalfright32, fidtx32 }, // V_ADST = 12
- { fidtx32, fhalfright32 }, // H_ADST = 13
- { fhalfright32, fidtx32 }, // V_FLIPADST = 14
- { fidtx32, fhalfright32 }, // H_FLIPADST = 15
-};
-#endif // CONFIG_EXT_TX
-
void av1_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
if (tx_type == DCT_DCT) {
aom_fdct4x4_c(input, output, stride);
} else {
+ static const transform_2d FHT[] = {
+ { fdct4, fdct4 }, // DCT_DCT
+ { fadst4, fdct4 }, // ADST_DCT
+ { fdct4, fadst4 }, // DCT_ADST
+ { fadst4, fadst4 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst4, fdct4 }, // FLIPADST_DCT
+ { fdct4, fadst4 }, // DCT_FLIPADST
+ { fadst4, fadst4 }, // FLIPADST_FLIPADST
+ { fadst4, fadst4 }, // ADST_FLIPADST
+ { fadst4, fadst4 }, // FLIPADST_ADST
+ { fidtx4, fidtx4 }, // IDTX
+ { fdct4, fidtx4 }, // V_DCT
+ { fidtx4, fdct4 }, // H_DCT
+ { fadst4, fidtx4 }, // V_ADST
+ { fidtx4, fadst4 }, // H_ADST
+ { fadst4, fidtx4 }, // V_FLIPADST
+ { fidtx4, fadst4 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const transform_2d ht = FHT[tx_type];
tran_low_t out[4 * 4];
int i, j;
tran_low_t temp_in[4], temp_out[4];
- const transform_2d ht = FHT_4[tx_type];
#if CONFIG_EXT_TX
int16_t flipped_input[4 * 4];
- maybe_flip_input(&input, &stride, 4, flipped_input, tx_type);
+ maybe_flip_input(&input, &stride, 4, 4, flipped_input, tx_type);
#endif
// Columns
@@ -1253,6 +1184,316 @@
}
}
+void av1_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct8, fdct4 }, // DCT_DCT
+ { fadst8, fdct4 }, // ADST_DCT
+ { fdct8, fadst4 }, // DCT_ADST
+ { fadst8, fadst4 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst8, fdct4 }, // FLIPADST_DCT
+ { fdct8, fadst4 }, // DCT_FLIPADST
+ { fadst8, fadst4 }, // FLIPADST_FLIPADST
+ { fadst8, fadst4 }, // ADST_FLIPADST
+ { fadst8, fadst4 }, // FLIPADST_ADST
+ { fidtx8, fidtx4 }, // IDTX
+ { fdct8, fidtx4 }, // V_DCT
+ { fidtx8, fdct4 }, // H_DCT
+ { fadst8, fidtx4 }, // V_ADST
+ { fidtx8, fadst4 }, // H_ADST
+ { fadst8, fidtx4 }, // V_FLIPADST
+ { fidtx8, fadst4 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 4;
+ const int n2 = 8;
+ tran_low_t out[8 * 4];
+ tran_low_t temp_in[8], temp_out[8];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[8 * 4];
+ maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
+#endif
+
+ // Columns
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j)
+ temp_in[j] =
+ (tran_low_t)fdct_round_shift(input[j * stride + i] * 8 * Sqrt2);
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n2; ++j) out[j * n + i] = temp_out[j];
+ }
+
+ // Rows
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n; ++j) output[j + i * n] = (temp_out[j] + 1) >> 2;
+ }
+ // Note: overall scale factor of transform is 8 times unitary
+}
+
+void av1_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct4, fdct8 }, // DCT_DCT
+ { fadst4, fdct8 }, // ADST_DCT
+ { fdct4, fadst8 }, // DCT_ADST
+ { fadst4, fadst8 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst4, fdct8 }, // FLIPADST_DCT
+ { fdct4, fadst8 }, // DCT_FLIPADST
+ { fadst4, fadst8 }, // FLIPADST_FLIPADST
+ { fadst4, fadst8 }, // ADST_FLIPADST
+ { fadst4, fadst8 }, // FLIPADST_ADST
+ { fidtx4, fidtx8 }, // IDTX
+ { fdct4, fidtx8 }, // V_DCT
+ { fidtx4, fdct8 }, // H_DCT
+ { fadst4, fidtx8 }, // V_ADST
+ { fidtx4, fadst8 }, // H_ADST
+ { fadst4, fidtx8 }, // V_FLIPADST
+ { fidtx4, fadst8 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 4;
+ const int n2 = 8;
+ tran_low_t out[8 * 4];
+ tran_low_t temp_in[8], temp_out[8];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[8 * 4];
+ maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
+#endif
+
+ // Columns
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j)
+ temp_in[j] =
+ (tran_low_t)fdct_round_shift(input[j * stride + i] * 8 * Sqrt2);
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
+ }
+
+ // Rows
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n2; ++j) output[j + i * n2] = (temp_out[j] + 1) >> 2;
+ }
+ // Note: overall scale factor of transform is 8 times unitary
+}
+
+void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct16, fdct8 }, // DCT_DCT
+ { fadst16, fdct8 }, // ADST_DCT
+ { fdct16, fadst8 }, // DCT_ADST
+ { fadst16, fadst8 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst16, fdct8 }, // FLIPADST_DCT
+ { fdct16, fadst8 }, // DCT_FLIPADST
+ { fadst16, fadst8 }, // FLIPADST_FLIPADST
+ { fadst16, fadst8 }, // ADST_FLIPADST
+ { fadst16, fadst8 }, // FLIPADST_ADST
+ { fidtx16, fidtx8 }, // IDTX
+ { fdct16, fidtx8 }, // V_DCT
+ { fidtx16, fdct8 }, // H_DCT
+ { fadst16, fidtx8 }, // V_ADST
+ { fidtx16, fadst8 }, // H_ADST
+ { fadst16, fidtx8 }, // V_FLIPADST
+ { fidtx16, fadst8 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 8;
+ const int n2 = 16;
+ tran_low_t out[16 * 8];
+ tran_low_t temp_in[16], temp_out[16];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[16 * 8];
+ maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
+#endif
+
+ // Columns
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j)
+ temp_in[j] = ROUND_POWER_OF_TWO_SIGNED(input[j * stride + i] * 4 * Sqrt2,
+ DCT_CONST_BITS);
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n2; ++j) out[j * n + i] = temp_out[j];
+ }
+
+ // Rows
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n; ++j)
+ output[j + i * n] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ }
+ // Note: overall scale factor of transform is 8 times unitary
+}
+
+void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct8, fdct16 }, // DCT_DCT
+ { fadst8, fdct16 }, // ADST_DCT
+ { fdct8, fadst16 }, // DCT_ADST
+ { fadst8, fadst16 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst8, fdct16 }, // FLIPADST_DCT
+ { fdct8, fadst16 }, // DCT_FLIPADST
+ { fadst8, fadst16 }, // FLIPADST_FLIPADST
+ { fadst8, fadst16 }, // ADST_FLIPADST
+ { fadst8, fadst16 }, // FLIPADST_ADST
+ { fidtx8, fidtx16 }, // IDTX
+ { fdct8, fidtx16 }, // V_DCT
+ { fidtx8, fdct16 }, // H_DCT
+ { fadst8, fidtx16 }, // V_ADST
+ { fidtx8, fadst16 }, // H_ADST
+ { fadst8, fidtx16 }, // V_FLIPADST
+ { fidtx8, fadst16 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 8;
+ const int n2 = 16;
+ tran_low_t out[16 * 8];
+ tran_low_t temp_in[16], temp_out[16];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[16 * 8];
+ maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
+#endif
+
+ // Columns
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j)
+ temp_in[j] = ROUND_POWER_OF_TWO_SIGNED(input[j * stride + i] * 4 * Sqrt2,
+ DCT_CONST_BITS);
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
+ }
+
+ // Rows
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n2; ++j)
+ output[j + i * n2] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ }
+ // Note: overall scale factor of transform is 8 times unitary
+}
+
+void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct32, fdct16 }, // DCT_DCT
+ { fhalfright32, fdct16 }, // ADST_DCT
+ { fdct32, fadst16 }, // DCT_ADST
+ { fhalfright32, fadst16 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fhalfright32, fdct16 }, // FLIPADST_DCT
+ { fdct32, fadst16 }, // DCT_FLIPADST
+ { fhalfright32, fadst16 }, // FLIPADST_FLIPADST
+ { fhalfright32, fadst16 }, // ADST_FLIPADST
+ { fhalfright32, fadst16 }, // FLIPADST_ADST
+ { fidtx32, fidtx16 }, // IDTX
+ { fdct32, fidtx16 }, // V_DCT
+ { fidtx32, fdct16 }, // H_DCT
+ { fhalfright32, fidtx16 }, // V_ADST
+ { fidtx32, fadst16 }, // H_ADST
+ { fhalfright32, fidtx16 }, // V_FLIPADST
+ { fidtx32, fadst16 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 16;
+ const int n2 = 32;
+ tran_low_t out[32 * 16];
+ tran_low_t temp_in[32], temp_out[32];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[32 * 16];
+ maybe_flip_input(&input, &stride, n2, n, flipped_input, tx_type);
+#endif
+
+ // Columns
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j)
+ temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * Sqrt2);
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n2; ++j) out[j * n + i] = temp_out[j];
+ }
+
+ // Rows
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n; ++j)
+ output[j + i * n] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+ }
+ // Note: overall scale factor of transform is 4 times unitary
+}
+
+void av1_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct16, fdct32 }, // DCT_DCT
+ { fadst16, fdct32 }, // ADST_DCT
+ { fdct16, fhalfright32 }, // DCT_ADST
+ { fadst16, fhalfright32 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst16, fdct32 }, // FLIPADST_DCT
+ { fdct16, fhalfright32 }, // DCT_FLIPADST
+ { fadst16, fhalfright32 }, // FLIPADST_FLIPADST
+ { fadst16, fhalfright32 }, // ADST_FLIPADST
+ { fadst16, fhalfright32 }, // FLIPADST_ADST
+ { fidtx16, fidtx32 }, // IDTX
+ { fdct16, fidtx32 }, // V_DCT
+ { fidtx16, fdct32 }, // H_DCT
+ { fadst16, fidtx32 }, // V_ADST
+ { fidtx16, fhalfright32 }, // H_ADST
+ { fadst16, fidtx32 }, // V_FLIPADST
+ { fidtx16, fhalfright32 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ const int n = 16;
+ const int n2 = 32;
+ tran_low_t out[32 * 16];
+ tran_low_t temp_in[32], temp_out[32];
+ int i, j;
+#if CONFIG_EXT_TX
+ int16_t flipped_input[32 * 16];
+ maybe_flip_input(&input, &stride, n, n2, flipped_input, tx_type);
+#endif
+
+ // Columns
+ for (i = 0; i < n2; ++i) {
+ for (j = 0; j < n; ++j)
+ temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * Sqrt2);
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
+ }
+
+ // Rows
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < n2; ++j)
+ output[j + i * n2] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+ }
+ // Note: overall scale factor of transform is 4 times unitary
+}
+
void av1_fdct8x8_quant_c(const int16_t *input, int stride,
tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
@@ -1383,14 +1624,34 @@
if (tx_type == DCT_DCT) {
aom_fdct8x8_c(input, output, stride);
} else {
+ static const transform_2d FHT[] = {
+ { fdct8, fdct8 }, // DCT_DCT
+ { fadst8, fdct8 }, // ADST_DCT
+ { fdct8, fadst8 }, // DCT_ADST
+ { fadst8, fadst8 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst8, fdct8 }, // FLIPADST_DCT
+ { fdct8, fadst8 }, // DCT_FLIPADST
+ { fadst8, fadst8 }, // FLIPADST_FLIPADST
+ { fadst8, fadst8 }, // ADST_FLIPADST
+ { fadst8, fadst8 }, // FLIPADST_ADST
+ { fidtx8, fidtx8 }, // IDTX
+ { fdct8, fidtx8 }, // V_DCT
+ { fidtx8, fdct8 }, // H_DCT
+ { fadst8, fidtx8 }, // V_ADST
+ { fidtx8, fadst8 }, // H_ADST
+ { fadst8, fidtx8 }, // V_FLIPADST
+ { fidtx8, fadst8 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+ const transform_2d ht = FHT[tx_type];
tran_low_t out[64];
int i, j;
tran_low_t temp_in[8], temp_out[8];
- const transform_2d ht = FHT_8[tx_type];
#if CONFIG_EXT_TX
int16_t flipped_input[8 * 8];
- maybe_flip_input(&input, &stride, 8, flipped_input, tx_type);
+ maybe_flip_input(&input, &stride, 8, 8, flipped_input, tx_type);
#endif
// Columns
@@ -1468,33 +1729,50 @@
void av1_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
- if (tx_type == DCT_DCT) {
- aom_fdct16x16_c(input, output, stride);
- } else {
- tran_low_t out[256];
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
- const transform_2d ht = FHT_16[tx_type];
+ static const transform_2d FHT[] = {
+ { fdct16, fdct16 }, // DCT_DCT
+ { fadst16, fdct16 }, // ADST_DCT
+ { fdct16, fadst16 }, // DCT_ADST
+ { fadst16, fadst16 }, // ADST_ADST
+#if CONFIG_EXT_TX
+ { fadst16, fdct16 }, // FLIPADST_DCT
+ { fdct16, fadst16 }, // DCT_FLIPADST
+ { fadst16, fadst16 }, // FLIPADST_FLIPADST
+ { fadst16, fadst16 }, // ADST_FLIPADST
+ { fadst16, fadst16 }, // FLIPADST_ADST
+ { fidtx16, fidtx16 }, // IDTX
+ { fdct16, fidtx16 }, // V_DCT
+ { fidtx16, fdct16 }, // H_DCT
+ { fadst16, fidtx16 }, // V_ADST
+ { fidtx16, fadst16 }, // H_ADST
+ { fadst16, fidtx16 }, // V_FLIPADST
+ { fidtx16, fadst16 }, // H_FLIPADST
+#endif // CONFIG_EXT_TX
+ };
+
+ const transform_2d ht = FHT[tx_type];
+ tran_low_t out[256];
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
#if CONFIG_EXT_TX
- int16_t flipped_input[16 * 16];
- maybe_flip_input(&input, &stride, 16, flipped_input, tx_type);
+ int16_t flipped_input[16 * 16];
+ maybe_flip_input(&input, &stride, 16, 16, flipped_input, tx_type);
#endif
- // Columns
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = input[j * stride + i] * 4;
- ht.cols(temp_in, temp_out);
- for (j = 0; j < 16; ++j)
- out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
- }
+ // Columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 16; ++j)
+ out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ }
- // Rows
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16];
- ht.rows(temp_in, temp_out);
- for (j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j];
- }
+ // Rows
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j];
}
}
@@ -1504,6 +1782,36 @@
av1_fht4x4_c(input, output, stride, tx_type);
}
+void av1_highbd_fht4x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht4x8_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht8x4_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht8x4_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht8x16_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht16x8_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht16x32_c(input, output, stride, tx_type);
+}
+
+void av1_highbd_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht32x16_c(input, output, stride, tx_type);
+}
+
void av1_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
av1_fht8x8_c(input, output, stride, tx_type);
@@ -1519,3 +1827,191 @@
av1_fht16x16_c(input, output, stride, tx_type);
}
#endif // CONFIG_AOM_HIGHBITDEPTH
+
+// TODO(luoyi): This function is added to avoid DCT_DCT overflow.
+// Remove this function once the column txfm output is scaled correctly.
+static INLINE int range_check_dct32x32(const int16_t *input, int16_t bound,
+ int size) {
+ int i;
+ for (i = 0; i < size; ++i) {
+ if (abs(input[i]) > bound) return 1;
+ }
+ return 0;
+}
+
+void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct32, fdct32 }, // DCT_DCT
+#if CONFIG_EXT_TX
+ { fhalfright32, fdct32 }, // ADST_DCT
+ { fdct32, fhalfright32 }, // DCT_ADST
+ { fhalfright32, fhalfright32 }, // ADST_ADST
+ { fhalfright32, fdct32 }, // FLIPADST_DCT
+ { fdct32, fhalfright32 }, // DCT_FLIPADST
+ { fhalfright32, fhalfright32 }, // FLIPADST_FLIPADST
+ { fhalfright32, fhalfright32 }, // ADST_FLIPADST
+ { fhalfright32, fhalfright32 }, // FLIPADST_ADST
+ { fidtx32, fidtx32 }, // IDTX
+ { fdct32, fidtx32 }, // V_DCT
+ { fidtx32, fdct32 }, // H_DCT
+ { fhalfright32, fidtx32 }, // V_ADST
+ { fidtx32, fhalfright32 }, // H_ADST
+ { fhalfright32, fidtx32 }, // V_FLIPADST
+ { fidtx32, fhalfright32 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ tran_low_t out[1024];
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+
+#if CONFIG_EXT_TX
+ int16_t flipped_input[32 * 32];
+ maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type);
+#endif
+
+ if (DCT_DCT == tx_type) {
+ if (range_check_dct32x32(input, (1 << 6) - 1, 1 << 10)) {
+ aom_fdct32x32_c(input, output, stride);
+ return;
+ }
+ }
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 32; ++j)
+ out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ }
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 32; ++j)
+ output[j + i * 32] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+ }
+}
+
+#if CONFIG_TX64X64
+#if CONFIG_EXT_TX
+static void fidtx64(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 64; ++i)
+ output[i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);
+}
+
+// For use in lieu of ADST
+static void fhalfright64(const tran_low_t *input, tran_low_t *output) {
+ int i;
+ tran_low_t inputhalf[32];
+ for (i = 0; i < 32; ++i) {
+ output[32 + i] = (tran_low_t)fdct_round_shift(input[i] * 4 * Sqrt2);
+ }
+ // Multiply input by sqrt(2)
+ for (i = 0; i < 32; ++i) {
+ inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 32] * Sqrt2);
+ }
+ fdct32(inputhalf, output);
+ // Note overall scaling factor is 2 times unitary
+}
+#endif // CONFIG_EXT_TX
+
+static void fdct64_col(const tran_low_t *input, tran_low_t *output) {
+ int32_t in[64], out[64];
+ int i;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_fdct64_new(in, out, fwd_cos_bit_col_dct_dct_64,
+ fwd_stage_range_col_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+static void fdct64_row(const tran_low_t *input, tran_low_t *output) {
+ int32_t in[64], out[64];
+ int i;
+ for (i = 0; i < 64; ++i) in[i] = (int32_t)input[i];
+ av1_fdct64_new(in, out, fwd_cos_bit_row_dct_dct_64,
+ fwd_stage_range_row_dct_dct_64);
+ for (i = 0; i < 64; ++i) output[i] = (tran_low_t)out[i];
+}
+
+void av1_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ static const transform_2d FHT[] = {
+ { fdct64_col, fdct64_row }, // DCT_DCT
+#if CONFIG_EXT_TX
+ { fhalfright64, fdct64_row }, // ADST_DCT
+ { fdct64_col, fhalfright64 }, // DCT_ADST
+ { fhalfright64, fhalfright64 }, // ADST_ADST
+ { fhalfright64, fdct64_row }, // FLIPADST_DCT
+ { fdct64_col, fhalfright64 }, // DCT_FLIPADST
+ { fhalfright64, fhalfright64 }, // FLIPADST_FLIPADST
+ { fhalfright64, fhalfright64 }, // ADST_FLIPADST
+ { fhalfright64, fhalfright64 }, // FLIPADST_ADST
+ { fidtx64, fidtx64 }, // IDTX
+ { fdct64_col, fidtx64 }, // V_DCT
+ { fidtx64, fdct64_row }, // H_DCT
+ { fhalfright64, fidtx64 }, // V_ADST
+ { fidtx64, fhalfright64 }, // H_ADST
+ { fhalfright64, fidtx64 }, // V_FLIPADST
+ { fidtx64, fhalfright64 }, // H_FLIPADST
+#endif
+ };
+ const transform_2d ht = FHT[tx_type];
+ tran_low_t out[4096];
+ int i, j;
+ tran_low_t temp_in[64], temp_out[64];
+#if CONFIG_EXT_TX
+ int16_t flipped_input[64 * 64];
+ maybe_flip_input(&input, &stride, 64, 64, flipped_input, tx_type);
+#endif
+ // Columns
+ for (i = 0; i < 64; ++i) {
+ for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i];
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 64; ++j)
+ out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ }
+
+ // Rows
+ for (i = 0; i < 64; ++i) {
+ for (j = 0; j < 64; ++j) temp_in[j] = out[j + i * 64];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 64; ++j)
+ output[j + i * 64] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+ }
+}
+#endif // CONFIG_TX64X64
+
+#if CONFIG_EXT_TX
+// Forward identity transform.
+void av1_fwd_idtx_c(const int16_t *src_diff, tran_low_t *coeff, int stride,
+ int bs, int tx_type) {
+ int r, c;
+ const int shift = bs < 32 ? 3 : (bs < 64 ? 2 : 1);
+ if (tx_type == IDTX) {
+ for (r = 0; r < bs; ++r) {
+ for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] * (1 << shift);
+ src_diff += stride;
+ coeff += bs;
+ }
+ }
+}
+
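
Worked out from the shift expression above, the identity transform scales the residuals by a fixed power of two per block size:

    bs = 4, 8 or 16  ->  shift = 3  ->  coeff = src_diff * 8
    bs = 32          ->  shift = 2  ->  coeff = src_diff * 4
    bs = 64          ->  shift = 1  ->  coeff = src_diff * 2
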
+#if CONFIG_AOM_HIGHBITDEPTH
+void av1_highbd_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht32x32_c(input, output, stride, tx_type);
+}
+
+#if CONFIG_TX64X64
+void av1_highbd_fht64x64_c(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ av1_fht64x64_c(input, output, stride, tx_type);
+}
+#endif // CONFIG_TX64X64
+#endif // CONFIG_AOM_HIGHBITDEPTH
+#endif // CONFIG_EXT_TX
diff --git a/av1/encoder/encodeframe.c b/av1/encoder/encodeframe.c
index f3cb422..baf61e9 100644
--- a/av1/encoder/encodeframe.c
+++ b/av1/encoder/encodeframe.c
@@ -37,6 +37,13 @@
#include "av1/encoder/aq_complexity.h"
#include "av1/encoder/aq_cyclicrefresh.h"
#include "av1/encoder/aq_variance.h"
+#if CONFIG_SUPERTX
+#include "av1/encoder/cost.h"
+#endif
+#if CONFIG_GLOBAL_MOTION
+#include "av1/common/warped_motion.h"
+#include "av1/encoder/global_motion.h"
+#endif
#include "av1/encoder/encodeframe.h"
#include "av1/encoder/encodemb.h"
#include "av1/encoder/encodemv.h"
@@ -46,38 +53,95 @@
#include "av1/encoder/rdopt.h"
#include "av1/encoder/segmentation.h"
#include "av1/encoder/tokenize.h"
-
#if CONFIG_PVQ
#include "av1/encoder/pvq_encoder.h"
#endif
+#if CONFIG_AOM_HIGHBITDEPTH
+#define IF_HBD(...) __VA_ARGS__
+#else
+#define IF_HBD(...)
+#endif // CONFIG_AOM_HIGHBITDEPTH
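
IF_HBD() keeps its arguments only in high-bit-depth builds and expands to nothing otherwise, so a single statement can replace an #if/#endif pair at the call site. A hypothetical use (the local variable here is illustrative, not taken from this file):

    // Declared and referenced only when CONFIG_AOM_HIGHBITDEPTH is defined;
    // compiles away entirely in 8-bit-only builds.
    IF_HBD(const int use_hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;)
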
static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
- TOKENEXTRA **t, int output_enabled, int mi_row,
+ TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
int mi_col, BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx);
+ PICK_MODE_CONTEXT *ctx, int *rate);
+
+#if CONFIG_SUPERTX
+static int check_intra_b(PICK_MODE_CONTEXT *ctx);
+
+static int check_intra_sb(const AV1_COMP *cpi, const TileInfo *const tile,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ PC_TREE *pc_tree);
+static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td,
+#if CONFIG_EXT_INTER
+ int mi_row_ori, int mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ int mi_row_pred, int mi_col_pred,
+ BLOCK_SIZE bsize_pred, int b_sub8x8, int block);
+static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
+ PC_TREE *pc_tree);
+static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row,
+ int mi_col, int mi_row_ori, int mi_col_ori,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ BLOCK_SIZE top_bsize, uint8_t *dst_buf[3],
+ int dst_stride[3], PC_TREE *pc_tree);
+static void update_state_sb_supertx(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ RUN_TYPE dry_run, PC_TREE *pc_tree);
+static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist,
+ TX_TYPE *best_tx, PC_TREE *pc_tree);
+#endif // CONFIG_SUPERTX
// This is used as a reference when computing the source variance for the
// purposes of activity masking.
// Eventually this should be replaced by custom no-reference routines,
// which will be faster.
-static const uint8_t AV1_VAR_OFFS[64] = {
+static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+#if CONFIG_EXT_PARTITION
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
+#endif // CONFIG_EXT_PARTITION
};
#if CONFIG_AOM_HIGHBITDEPTH
-static const uint16_t AV1_HIGH_VAR_OFFS_8[64] = {
+static const uint16_t AV1_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+#if CONFIG_EXT_PARTITION
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
+#endif // CONFIG_EXT_PARTITION
};
-static const uint16_t AV1_HIGH_VAR_OFFS_10[64] = {
+static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = {
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+#if CONFIG_EXT_PARTITION
128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
@@ -86,9 +150,21 @@
128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4
+#endif // CONFIG_EXT_PARTITION
};
-static const uint16_t AV1_HIGH_VAR_OFFS_12[64] = {
+static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = {
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16,
+#if CONFIG_EXT_PARTITION
128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
@@ -99,6 +175,7 @@
128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
128 * 16
+#endif // CONFIG_EXT_PARTITION
};
#endif // CONFIG_AOM_HIGHBITDEPTH
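
The flat buffers above (128, 128 * 4 and 128 * 16 for 8-, 10- and 12-bit sources) act as an all-DC reference, so comparing a source block against them measures the block's own activity (its variance about its mean). A sketch of the usual call shape, with the variance-function table and plane fields assumed from the surrounding encoder code rather than shown in this hunk:

    // Sketch: per-pixel source variance measured against the constant
    // mid-grey buffer (reference stride 0 replays the same row).
    unsigned int sse;
    const unsigned int var =
        cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride,
                              AV1_VAR_OFFS, 0, &sse);
    const unsigned int var_per_pixel =
        ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bsize]);
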
@@ -169,10 +246,9 @@
// Lighter version of set_offsets that only sets the mode info
// pointers.
-static INLINE void set_mode_info_offsets(const AV1_COMP *const cpi,
- MACROBLOCK *const x,
- MACROBLOCKD *const xd, int mi_row,
- int mi_col) {
+static void set_mode_info_offsets(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, MACROBLOCKD *const xd,
+ int mi_row, int mi_col) {
const AV1_COMMON *const cm = &cpi->common;
const int idx_str = xd->mi_stride * mi_row + mi_col;
xd->mi = cm->mi_grid_visible + idx_str;
@@ -180,23 +256,27 @@
x->mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
}
-static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
- MACROBLOCK *const x, int mi_row, int mi_col,
- BLOCK_SIZE bsize) {
+static void set_offsets_without_segment_id(const AV1_COMP *const cpi,
+ const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *mbmi;
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
const int mi_height = num_8x8_blocks_high_lookup[bsize];
const int bwl = b_width_log2_lookup[AOMMAX(bsize, BLOCK_8X8)];
const int bhl = b_height_log2_lookup[AOMMAX(bsize, BLOCK_8X8)];
- const struct segmentation *const seg = &cm->seg;
set_skip_context(xd, mi_row, mi_col);
set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
- mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ xd->max_tx_size = max_txsize_lookup[bsize];
+#endif
// Set up destination pointers.
av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
@@ -222,22 +302,126 @@
x->rddiv = cpi->rd.RDDIV;
x->rdmult = cpi->rd.RDMULT;
+ // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs()
+ xd->tile = *tile;
+}
+
+static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi;
+ const struct segmentation *const seg = &cm->seg;
+
+ set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+
+ mbmi = &xd->mi[0]->mbmi;
+
// Setup segment ID.
if (seg->enabled) {
- if (cpi->oxcf.aq_mode != VARIANCE_AQ) {
+ if (!cpi->vaq_refresh) {
const uint8_t *const map =
seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
}
- av1_init_plane_quantizers(cpi, x);
+ av1_init_plane_quantizers(cpi, x, mbmi->segment_id);
} else {
mbmi->segment_id = 0;
}
- // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs()
- xd->tile = *tile;
+#if CONFIG_SUPERTX
+ mbmi->segment_id_supertx = MAX_SEGMENTS;
+#endif // CONFIG_SUPERTX
}
+#if CONFIG_SUPERTX
+static void set_offsets_supertx(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ MACROBLOCK *const x = &td->mb;
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+
+ set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+
+ // Set up distance of MB to edge of frame in 1/8th pel units.
+ assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+ set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, cm->mi_rows,
+ cm->mi_cols);
+}
+
+static void set_offsets_extend(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row_pred,
+ int mi_col_pred, int mi_row_ori, int mi_col_ori,
+ BLOCK_SIZE bsize_pred) {
+ // Used in supertx
+ // (mi_row_ori, mi_col_ori, bsize_ori): region for mv
+ // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+ MACROBLOCK *const x = &td->mb;
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize_pred];
+ const int mi_height = num_8x8_blocks_high_lookup[bsize_pred];
+
+ set_mode_info_offsets(cpi, x, xd, mi_row_ori, mi_col_ori);
+
+ // Set up limit values for MV components.
+  // MVs beyond this range do not produce a new or different prediction block.
+ x->mv_row_min = -(((mi_row_pred + mi_height) * MI_SIZE) + AOM_INTERP_EXTEND);
+ x->mv_col_min = -(((mi_col_pred + mi_width) * MI_SIZE) + AOM_INTERP_EXTEND);
+ x->mv_row_max = (cm->mi_rows - mi_row_pred) * MI_SIZE + AOM_INTERP_EXTEND;
+ x->mv_col_max = (cm->mi_cols - mi_col_pred) * MI_SIZE + AOM_INTERP_EXTEND;
+
+ // Set up distance of MB to edge of frame in 1/8th pel units.
+ assert(!(mi_col_pred & (mi_width - 1)) && !(mi_row_pred & (mi_height - 1)));
+ set_mi_row_col(xd, tile, mi_row_pred, mi_height, mi_col_pred, mi_width,
+ cm->mi_rows, cm->mi_cols);
+ xd->up_available = (mi_row_ori > tile->mi_row_start);
+ xd->left_available = (mi_col_ori > tile->mi_col_start);
+
+ // R/D setup.
+ x->rddiv = cpi->rd.RDDIV;
+ x->rdmult = cpi->rd.RDMULT;
+}
+
+static void set_segment_id_supertx(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, const int mi_row,
+ const int mi_col, const BLOCK_SIZE bsize) {
+ const AV1_COMMON *cm = &cpi->common;
+ const struct segmentation *seg = &cm->seg;
+ const int miw =
+ AOMMIN(num_8x8_blocks_wide_lookup[bsize], cm->mi_cols - mi_col);
+ const int mih =
+ AOMMIN(num_8x8_blocks_high_lookup[bsize], cm->mi_rows - mi_row);
+ const int mi_offset = mi_row * cm->mi_stride + mi_col;
+ MODE_INFO **const mip = cm->mi_grid_visible + mi_offset;
+ int r, c;
+ int seg_id_supertx = MAX_SEGMENTS;
+
+ if (!seg->enabled) {
+ seg_id_supertx = 0;
+ } else {
+ // Find the minimum segment_id
+ for (r = 0; r < mih; r++)
+ for (c = 0; c < miw; c++)
+ seg_id_supertx =
+ AOMMIN(mip[r * cm->mi_stride + c]->mbmi.segment_id, seg_id_supertx);
+ assert(0 <= seg_id_supertx && seg_id_supertx < MAX_SEGMENTS);
+
+    // Initialize the plane quantizers for the chosen segment
+ av1_init_plane_quantizers(cpi, x, seg_id_supertx);
+ }
+
+  // Assign the derived seg_id_supertx back to every block in the region
+ for (r = 0; r < mih; r++)
+ for (c = 0; c < miw; c++)
+ mip[r * cm->mi_stride + c]->mbmi.segment_id_supertx = seg_id_supertx;
+}
+#endif // CONFIG_SUPERTX
+
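
For example, if the blocks covered by a supertx region carry segment_ids {3, 1, 2, 1}, the AOMMIN reduction above yields seg_id_supertx = 1, the plane quantizers are initialized for segment 1, and segment_id_supertx = 1 is written back to every mode-info unit in the region; when segmentation is disabled the region simply gets segment_id_supertx = 0 and the quantizers are left untouched.
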
static void set_block_size(AV1_COMP *const cpi, MACROBLOCK *const x,
MACROBLOCKD *const xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
@@ -247,210 +431,94 @@
}
}
-typedef struct {
- int64_t sum_square_error;
- int64_t sum_error;
- int log2_count;
- int variance;
-} var;
-
-typedef struct {
- var none;
- var horz[2];
- var vert[2];
-} partition_variance;
-
-typedef struct {
- partition_variance part_variances;
- var split[4];
-} v4x4;
-
-typedef struct {
- partition_variance part_variances;
- v4x4 split[4];
-} v8x8;
-
-typedef struct {
- partition_variance part_variances;
- v8x8 split[4];
-} v16x16;
-
-typedef struct {
- partition_variance part_variances;
- v16x16 split[4];
-} v32x32;
-
-typedef struct {
- partition_variance part_variances;
- v32x32 split[4];
-} v64x64;
-
-typedef struct {
- partition_variance *part_variances;
- var *split[4];
-} variance_node;
-
-typedef enum {
- V16X16,
- V32X32,
- V64X64,
-} TREE_LEVEL;
-
-static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
- int i;
- node->part_variances = NULL;
- switch (bsize) {
- case BLOCK_64X64: {
- v64x64 *vt = (v64x64 *)data;
- node->part_variances = &vt->part_variances;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i].part_variances.none;
- break;
- }
- case BLOCK_32X32: {
- v32x32 *vt = (v32x32 *)data;
- node->part_variances = &vt->part_variances;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i].part_variances.none;
- break;
- }
- case BLOCK_16X16: {
- v16x16 *vt = (v16x16 *)data;
- node->part_variances = &vt->part_variances;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i].part_variances.none;
- break;
- }
- case BLOCK_8X8: {
- v8x8 *vt = (v8x8 *)data;
- node->part_variances = &vt->part_variances;
- for (i = 0; i < 4; i++)
- node->split[i] = &vt->split[i].part_variances.none;
- break;
- }
- case BLOCK_4X4: {
- v4x4 *vt = (v4x4 *)data;
- node->part_variances = &vt->part_variances;
- for (i = 0; i < 4; i++) node->split[i] = &vt->split[i];
- break;
- }
- default: {
- assert(0);
- break;
- }
- }
-}
-
-// Set variance values given sum square error, sum error, count.
-static void fill_variance(int64_t s2, int64_t s, int c, var *v) {
- v->sum_square_error = s2;
- v->sum_error = s;
- v->log2_count = c;
-}
-
-static void get_variance(var *v) {
- v->variance =
- (int)(256 * (v->sum_square_error -
- ((v->sum_error * v->sum_error) >> v->log2_count)) >>
- v->log2_count);
-}
-
-static void sum_2_variances(const var *a, const var *b, var *r) {
- assert(a->log2_count == b->log2_count);
- fill_variance(a->sum_square_error + b->sum_square_error,
- a->sum_error + b->sum_error, a->log2_count + 1, r);
-}
-
-static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
- variance_node node;
- memset(&node, 0, sizeof(node));
- tree_to_node(data, bsize, &node);
- sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]);
- sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]);
- sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]);
- sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]);
- sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1],
- &node.part_variances->none);
-}
-
-static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x,
- MACROBLOCKD *const xd, void *data,
- BLOCK_SIZE bsize, int mi_row, int mi_col,
- int64_t threshold, BLOCK_SIZE bsize_min,
- int force_split) {
+static void set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x,
+ MACROBLOCKD *const xd, VAR_TREE *vt, int mi_row,
+ int mi_col, const int64_t *const threshold,
+ const BLOCK_SIZE *const bsize_min) {
AV1_COMMON *const cm = &cpi->common;
- variance_node vt;
- const int block_width = num_8x8_blocks_wide_lookup[bsize];
- const int block_height = num_8x8_blocks_high_lookup[bsize];
- const int low_res = (cm->width <= 352 && cm->height <= 288);
+ const int hbw = num_8x8_blocks_wide_lookup[vt->bsize] / 2;
+ const int hbh = num_8x8_blocks_high_lookup[vt->bsize] / 2;
+ const int has_cols = mi_col + hbw < cm->mi_cols;
+ const int has_rows = mi_row + hbh < cm->mi_rows;
- assert(block_height == block_width);
- tree_to_node(data, bsize, &vt);
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
- if (force_split == 1) return 0;
+ assert(vt->bsize >= BLOCK_8X8);
+
+ assert(hbh == hbw);
+
+ if (vt->bsize == BLOCK_8X8 && cm->frame_type != KEY_FRAME) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_8X8);
+ return;
+ }
+
+ if (vt->force_split || (!has_cols && !has_rows)) goto split;
// For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
// variance is below threshold, otherwise split will be selected.
// No check for vert/horiz split as too few samples for variance.
- if (bsize == bsize_min) {
- // Variance already computed to set the force_split.
- if (low_res || cm->frame_type == KEY_FRAME)
- get_variance(&vt.part_variances->none);
- if (mi_col + block_width / 2 < cm->mi_cols &&
- mi_row + block_height / 2 < cm->mi_rows &&
- vt.part_variances->none.variance < threshold) {
- set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
- return 1;
+ if (vt->bsize == bsize_min[0]) {
+ if (has_cols && has_rows && vt->variances.none.variance < threshold[0]) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize);
+ return;
+ } else {
+ BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_SPLIT);
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ if (vt->bsize > BLOCK_8X8) {
+ set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize);
+ set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row + hbh, mi_col + hbw, subsize);
+ }
+ return;
}
- return 0;
- } else if (bsize > bsize_min) {
- // Variance already computed to set the force_split.
- if (low_res || cm->frame_type == KEY_FRAME)
- get_variance(&vt.part_variances->none);
+ } else if (vt->bsize > bsize_min[0]) {
// For key frame: take split for bsize above 32X32 or very high variance.
if (cm->frame_type == KEY_FRAME &&
- (bsize > BLOCK_32X32 ||
- vt.part_variances->none.variance > (threshold << 4))) {
- return 0;
+ (vt->bsize > BLOCK_32X32 ||
+ vt->variances.none.variance > (threshold[0] << 4))) {
+ goto split;
}
// If variance is low, take the bsize (no split).
- if (mi_col + block_width / 2 < cm->mi_cols &&
- mi_row + block_height / 2 < cm->mi_rows &&
- vt.part_variances->none.variance < threshold) {
- set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
- return 1;
+ if (has_cols && has_rows && vt->variances.none.variance < threshold[0]) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize);
+ return;
}
// Check vertical split.
- if (mi_row + block_height / 2 < cm->mi_rows) {
- BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
- get_variance(&vt.part_variances->vert[0]);
- get_variance(&vt.part_variances->vert[1]);
- if (vt.part_variances->vert[0].variance < threshold &&
- vt.part_variances->vert[1].variance < threshold &&
+ if (has_rows) {
+ BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_VERT);
+ if (vt->variances.vert[0].variance < threshold[0] &&
+ vt->variances.vert[1].variance < threshold[0] &&
get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
- set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize);
- return 1;
+ set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize);
+ return;
}
}
// Check horizontal split.
- if (mi_col + block_width / 2 < cm->mi_cols) {
- BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
- get_variance(&vt.part_variances->horz[0]);
- get_variance(&vt.part_variances->horz[1]);
- if (vt.part_variances->horz[0].variance < threshold &&
- vt.part_variances->horz[1].variance < threshold &&
+ if (has_cols) {
+ BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_HORZ);
+ if (vt->variances.horz[0].variance < threshold[0] &&
+ vt->variances.horz[1].variance < threshold[0] &&
get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
- set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize);
- return 1;
+ set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize);
+ return;
}
}
-
- return 0;
}
- return 0;
+
+split : {
+ set_vt_partitioning(cpi, x, xd, vt->split[0], mi_row, mi_col, threshold + 1,
+ bsize_min + 1);
+ set_vt_partitioning(cpi, x, xd, vt->split[1], mi_row, mi_col + hbw,
+ threshold + 1, bsize_min + 1);
+ set_vt_partitioning(cpi, x, xd, vt->split[2], mi_row + hbh, mi_col,
+ threshold + 1, bsize_min + 1);
+ set_vt_partitioning(cpi, x, xd, vt->split[3], mi_row + hbh, mi_col + hbw,
+ threshold + 1, bsize_min + 1);
+ return;
+}
}
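For orientation: the variance that set_vt_partitioning() compares against these thresholds is the integer estimate computed by the removed get_variance() above, i.e. 256 * (SSE - Sum^2 / 2^n) / 2^n, all in integer arithmetic; the new VAR_TREE path presumably produces the same quantity via fill_variance_node(), which is not shown in this hunk. A minimal standalone sketch of that formula with made-up numbers (not encoder code):

#include <stdint.h>
#include <stdio.h>

/* Standalone sketch: the integer variance used by the partition thresholds,
 * mirroring the removed get_variance() above. */
static int block_variance(int64_t sum_square_error, int64_t sum_error,
                          int log2_count) {
  /* 256 * (E[x^2] - E[x]^2), kept entirely in integer arithmetic. */
  return (int)(256 * (sum_square_error -
                      ((sum_error * sum_error) >> log2_count)) >>
               log2_count);
}

int main(void) {
  /* Hypothetical accumulated values for four summed sub-blocks (2^2). */
  printf("variance = %d\n", block_variance(1200, 60, 2));
  return 0;
}

The factor of 256 apparently just preserves some fractional precision while staying in integer math.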
// Set the variance split thresholds for following the block sizes:
@@ -464,23 +532,24 @@
const int64_t threshold_base =
(int64_t)(threshold_multiplier * cpi->y_dequant[q][1]);
if (is_key_frame) {
- thresholds[0] = threshold_base;
- thresholds[1] = threshold_base >> 2;
- thresholds[2] = threshold_base >> 2;
- thresholds[3] = threshold_base << 2;
- } else {
thresholds[1] = threshold_base;
+ thresholds[2] = threshold_base >> 2;
+ thresholds[3] = threshold_base >> 2;
+ thresholds[4] = threshold_base << 2;
+ } else {
+ thresholds[2] = threshold_base;
if (cm->width <= 352 && cm->height <= 288) {
- thresholds[0] = threshold_base >> 2;
- thresholds[2] = threshold_base << 3;
+ thresholds[1] = threshold_base >> 2;
+ thresholds[3] = threshold_base << 3;
} else {
- thresholds[0] = threshold_base;
- thresholds[1] = (5 * threshold_base) >> 2;
+ thresholds[1] = threshold_base;
+ thresholds[2] = (5 * threshold_base) >> 2;
if (cm->width >= 1920 && cm->height >= 1080)
- thresholds[1] = (7 * threshold_base) >> 2;
- thresholds[2] = threshold_base << cpi->oxcf.speed;
+ thresholds[2] = (7 * threshold_base) >> 2;
+ thresholds[3] = threshold_base << cpi->oxcf.speed;
}
}
+ thresholds[0] = INT64_MIN;
}
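The threshold indices all move up by one: index 0 now corresponds to the superblock level above 64x64 (presumably 128x128 with the extended-partition experiment) and is set to INT64_MIN, so that level can never satisfy "variance < threshold" and is effectively always split; for 64x64 superblocks choose_partitioning() skips index 0 via start_level. A small, self-contained sketch of the resulting key-frame layout, where threshold_base and the level labels reflect my reading of the new indexing rather than anything stated in this patch:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int64_t threshold_base = 4000; /* made-up example value */
  int64_t thresholds[5];
  thresholds[1] = threshold_base;      /* 64x64 */
  thresholds[2] = threshold_base >> 2; /* 32x32 */
  thresholds[3] = threshold_base >> 2; /* 16x16 */
  thresholds[4] = threshold_base << 2; /* 8x8   */
  thresholds[0] = INT64_MIN;           /* superblock > 64x64: always split */
  for (int i = 0; i < 5; ++i)
    printf("thresholds[%d] = %" PRId64 "\n", i, thresholds[i]);
  return 0;
}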
void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q) {
@@ -510,10 +579,10 @@
}
// Compute the minmax over the 8x8 subblocks.
-static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
- int dp, int x16_idx, int y16_idx,
+static int compute_minmax_8x8(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
#if CONFIG_AOM_HIGHBITDEPTH
- int highbd_flag,
+ int highbd,
#endif
int pixels_wide, int pixels_high) {
int k;
@@ -521,21 +590,23 @@
int minmax_min = 255;
// Loop over the 4 8x8 subblocks.
for (k = 0; k < 4; k++) {
- int x8_idx = x16_idx + ((k & 1) << 3);
- int y8_idx = y16_idx + ((k >> 1) << 3);
+ const int x8_idx = ((k & 1) << 3);
+ const int y8_idx = ((k >> 1) << 3);
int min = 0;
int max = 0;
if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+ const int src_offset = y8_idx * src_stride + x8_idx;
+ const int ref_offset = y8_idx * ref_stride + x8_idx;
#if CONFIG_AOM_HIGHBITDEPTH
- if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
- aom_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
- d + y8_idx * dp + x8_idx, dp, &min, &max);
+ if (highbd) {
+ aom_highbd_minmax_8x8(src + src_offset, src_stride, ref + ref_offset,
+ ref_stride, &min, &max);
} else {
- aom_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx,
- dp, &min, &max);
+ aom_minmax_8x8(src + src_offset, src_stride, ref + ref_offset,
+ ref_stride, &min, &max);
}
#else
- aom_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, dp,
+ aom_minmax_8x8(src + src_offset, src_stride, ref + ref_offset, ref_stride,
&min, &max);
#endif
if ((max - min) > minmax_max) minmax_max = (max - min);
@@ -545,107 +616,236 @@
return (minmax_max - minmax_min);
}
-static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
- int dp, int x8_idx, int y8_idx, v8x8 *vst,
#if CONFIG_AOM_HIGHBITDEPTH
- int highbd_flag,
-#endif
- int pixels_wide, int pixels_high,
- int is_key_frame) {
- int k;
- for (k = 0; k < 4; k++) {
- int x4_idx = x8_idx + ((k & 1) << 2);
- int y4_idx = y8_idx + ((k >> 1) << 2);
- unsigned int sse = 0;
- int sum = 0;
- if (x4_idx < pixels_wide && y4_idx < pixels_high) {
- int s_avg;
- int d_avg = 128;
-#if CONFIG_AOM_HIGHBITDEPTH
- if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
- s_avg = aom_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
- if (!is_key_frame)
- d_avg = aom_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
- } else {
- s_avg = aom_avg_4x4(s + y4_idx * sp + x4_idx, sp);
- if (!is_key_frame) d_avg = aom_avg_4x4(d + y4_idx * dp + x4_idx, dp);
- }
+static INLINE int avg_4x4(const uint8_t *const src, const int stride,
+ const int highbd) {
+ if (highbd) {
+ return aom_highbd_avg_4x4(src, stride);
+ } else {
+ return aom_avg_4x4(src, stride);
+ }
+}
#else
- s_avg = aom_avg_4x4(s + y4_idx * sp + x4_idx, sp);
- if (!is_key_frame) d_avg = aom_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+static INLINE int avg_4x4(const uint8_t *const src, const int stride) {
+ return aom_avg_4x4(src, stride);
+}
#endif
- sum = s_avg - d_avg;
- sse = sum * sum;
- }
- fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+
+#if CONFIG_AOM_HIGHBITDEPTH
+static INLINE int avg_8x8(const uint8_t *const src, const int stride,
+ const int highbd) {
+ if (highbd) {
+ return aom_highbd_avg_8x8(src, stride);
+ } else {
+ return aom_avg_8x8(src, stride);
+ }
+}
+#else
+static INLINE int avg_8x8(const uint8_t *const src, const int stride) {
+ return aom_avg_8x8(src, stride);
+}
+#endif
+
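These wrappers are invoked later through the IF_HBD(...) macro (see fill_variance_tree() below), which is not defined in this hunk. Presumably it forwards its arguments only in CONFIG_AOM_HIGHBITDEPTH builds, roughly along these lines (an assumed definition, not part of this patch):

/* Assumed definition: pass the extra high-bitdepth argument only when
 * CONFIG_AOM_HIGHBITDEPTH is enabled. */
#if CONFIG_AOM_HIGHBITDEPTH
#define IF_HBD(...) __VA_ARGS__
#else
#define IF_HBD(...)
#endif

With a definition like that, avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd)) resolves to either the two- or three-argument form, so a single call site serves both build configurations.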
+static void init_variance_tree(VAR_TREE *const vt,
+#if CONFIG_AOM_HIGHBITDEPTH
+ const int highbd,
+#endif
+ BLOCK_SIZE bsize, BLOCK_SIZE leaf_size,
+ const int width, const int height,
+ const uint8_t *const src, const int src_stride,
+ const uint8_t *const ref, const int ref_stride) {
+ assert(bsize >= leaf_size);
+
+ vt->bsize = bsize;
+
+ vt->force_split = 0;
+
+ vt->src = src;
+ vt->src_stride = src_stride;
+ vt->ref = ref;
+ vt->ref_stride = ref_stride;
+
+ vt->width = width;
+ vt->height = height;
+
+#if CONFIG_AOM_HIGHBITDEPTH
+ vt->highbd = highbd;
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+ if (bsize > leaf_size) {
+ const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+ const int px = num_4x4_blocks_wide_lookup[subsize] * 4;
+
+ init_variance_tree(vt->split[0],
+#if CONFIG_AOM_HIGHBITDEPTH
+ highbd,
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ subsize, leaf_size, AOMMIN(px, width),
+ AOMMIN(px, height), src, src_stride, ref, ref_stride);
+ init_variance_tree(vt->split[1],
+#if CONFIG_AOM_HIGHBITDEPTH
+ highbd,
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ subsize, leaf_size, width - px, AOMMIN(px, height),
+ src + px, src_stride, ref + px, ref_stride);
+ init_variance_tree(vt->split[2],
+#if CONFIG_AOM_HIGHBITDEPTH
+ highbd,
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ subsize, leaf_size, AOMMIN(px, width), height - px,
+ src + px * src_stride, src_stride, ref + px * ref_stride,
+ ref_stride);
+ init_variance_tree(vt->split[3],
+#if CONFIG_AOM_HIGHBITDEPTH
+ highbd,
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ subsize, leaf_size, width - px, height - px,
+ src + px * src_stride + px, src_stride,
+ ref + px * ref_stride + px, ref_stride);
}
}
-static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
- int dp, int x16_idx, int y16_idx, v16x16 *vst,
-#if CONFIG_AOM_HIGHBITDEPTH
- int highbd_flag,
-#endif
- int pixels_wide, int pixels_high,
- int is_key_frame) {
- int k;
- for (k = 0; k < 4; k++) {
- int x8_idx = x16_idx + ((k & 1) << 3);
- int y8_idx = y16_idx + ((k >> 1) << 3);
+// Fill the variance tree based on averaging pixel values (sub-sampling), at
+// the leaf node size.
+static void fill_variance_tree(VAR_TREE *const vt, const BLOCK_SIZE leaf_size) {
+ if (vt->bsize > leaf_size) {
+ fill_variance_tree(vt->split[0], leaf_size);
+ fill_variance_tree(vt->split[1], leaf_size);
+ fill_variance_tree(vt->split[2], leaf_size);
+ fill_variance_tree(vt->split[3], leaf_size);
+ fill_variance_node(vt);
+ } else if (vt->width <= 0 || vt->height <= 0) {
+ fill_variance(0, 0, 0, &vt->variances.none);
+ } else {
unsigned int sse = 0;
int sum = 0;
- if (x8_idx < pixels_wide && y8_idx < pixels_high) {
- int s_avg;
- int d_avg = 128;
-#if CONFIG_AOM_HIGHBITDEPTH
- if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
- s_avg = aom_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
- if (!is_key_frame)
- d_avg = aom_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
- } else {
- s_avg = aom_avg_8x8(s + y8_idx * sp + x8_idx, sp);
- if (!is_key_frame) d_avg = aom_avg_8x8(d + y8_idx * dp + x8_idx, dp);
- }
-#else
- s_avg = aom_avg_8x8(s + y8_idx * sp + x8_idx, sp);
- if (!is_key_frame) d_avg = aom_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-#endif
- sum = s_avg - d_avg;
- sse = sum * sum;
+ int src_avg;
+ int ref_avg;
+ assert(leaf_size == BLOCK_4X4 || leaf_size == BLOCK_8X8);
+ if (leaf_size == BLOCK_4X4) {
+ src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+ ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
+ } else {
+ src_avg = avg_8x8(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+ ref_avg = avg_8x8(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
}
- fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+ sum = src_avg - ref_avg;
+ sse = sum * sum;
+ fill_variance(sse, sum, 0, &vt->variances.none);
}
}
+static void refine_variance_tree(VAR_TREE *const vt, const int64_t threshold) {
+ if (vt->bsize >= BLOCK_8X8) {
+ if (vt->bsize == BLOCK_16X16) {
+ if (vt->variances.none.variance <= threshold)
+ return;
+ else
+ vt->force_split = 0;
+ }
+
+ refine_variance_tree(vt->split[0], threshold);
+ refine_variance_tree(vt->split[1], threshold);
+ refine_variance_tree(vt->split[2], threshold);
+ refine_variance_tree(vt->split[3], threshold);
+
+ if (vt->bsize <= BLOCK_16X16) fill_variance_node(vt);
+ } else if (vt->width <= 0 || vt->height <= 0) {
+ fill_variance(0, 0, 0, &vt->variances.none);
+ } else {
+ const int src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+ const int ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
+ const int sum = src_avg - ref_avg;
+ const unsigned int sse = sum * sum;
+ assert(vt->bsize == BLOCK_4X4);
+ fill_variance(sse, sum, 0, &vt->variances.none);
+ }
+}
+
+static int check_split_key_frame(VAR_TREE *const vt, const int64_t threshold) {
+ if (vt->bsize == BLOCK_32X32) {
+ vt->force_split = vt->variances.none.variance > threshold;
+ } else {
+ vt->force_split |= check_split_key_frame(vt->split[0], threshold);
+ vt->force_split |= check_split_key_frame(vt->split[1], threshold);
+ vt->force_split |= check_split_key_frame(vt->split[2], threshold);
+ vt->force_split |= check_split_key_frame(vt->split[3], threshold);
+ }
+ return vt->force_split;
+}
+
+static int check_split(AV1_COMP *const cpi, VAR_TREE *const vt,
+ const int segment_id, const int64_t *const thresholds) {
+ if (vt->bsize == BLOCK_16X16) {
+ vt->force_split = vt->variances.none.variance > thresholds[0];
+ if (!vt->force_split && vt->variances.none.variance > thresholds[-1] &&
+ !cyclic_refresh_segment_id_boosted(segment_id)) {
+      // There is some nominal amount of 16x16 variance (based on the
+      // averages); compute the minmax over the 8x8 sub-blocks and, if it is
+      // above threshold, force a split to 8x8 blocks for this 16x16 block.
+ int minmax =
+ compute_minmax_8x8(vt->src, vt->src_stride, vt->ref, vt->ref_stride,
+#if CONFIG_AOM_HIGHBITDEPTH
+ vt->highbd,
+#endif
+ vt->width, vt->height);
+ vt->force_split = minmax > cpi->vbp_threshold_minmax;
+ }
+ } else {
+ vt->force_split |=
+ check_split(cpi, vt->split[0], segment_id, thresholds + 1);
+ vt->force_split |=
+ check_split(cpi, vt->split[1], segment_id, thresholds + 1);
+ vt->force_split |=
+ check_split(cpi, vt->split[2], segment_id, thresholds + 1);
+ vt->force_split |=
+ check_split(cpi, vt->split[3], segment_id, thresholds + 1);
+
+ if (vt->bsize == BLOCK_32X32 && !vt->force_split) {
+ vt->force_split = vt->variances.none.variance > thresholds[0];
+ }
+ }
+
+ return vt->force_split;
+}
+
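check_split() ORs each child's decision into its parent, so a single 16x16 block over threshold forces splits all the way up to the superblock; note also that thresholds[-1] at the 16x16 level reaches back to the 32x32 threshold, because the pointer is advanced by one per recursion level starting from thre in choose_partitioning(). A toy, self-contained illustration of that propagation only (it omits the minmax check and the 32x32 own-variance check, and uses none of the encoder's types):

#include <stdio.h>

/* Toy two-level tree mirroring only the force_split bubbling in
 * check_split(). */
typedef struct toy_node {
  int variance;
  int force_split;
  struct toy_node *split[4]; /* NULL at the leaves */
} toy_node;

static int toy_check_split(toy_node *n, int threshold) {
  if (n->split[0] == NULL) {
    n->force_split = n->variance > threshold;
  } else {
    for (int i = 0; i < 4; ++i)
      n->force_split |= toy_check_split(n->split[i], threshold);
  }
  return n->force_split;
}

int main(void) {
  toy_node leaves[4] = { { 10, 0, { NULL } },
                         { 12, 0, { NULL } },
                         { 500, 0, { NULL } },
                         { 8, 0, { NULL } } };
  toy_node root = { 0, 0,
                    { &leaves[0], &leaves[1], &leaves[2], &leaves[3] } };
  /* One child above the threshold (500 > 100) forces the parent to split. */
  printf("root force_split = %d\n", toy_check_split(&root, 100));
  return 0;
}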
// This function chooses partitioning based on the variance between source and
-// reconstructed last, where variance is computed for down-sampled inputs.
-static int choose_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
- MACROBLOCK *x, int mi_row, int mi_col) {
+// reconstructed last (or golden), where variance is computed for down-sampled
+// inputs.
+static void choose_partitioning(AV1_COMP *const cpi, ThreadData *const td,
+ const TileInfo *const tile, MACROBLOCK *const x,
+ const int mi_row, const int mi_col) {
AV1_COMMON *const cm = &cpi->common;
- MACROBLOCKD *xd = &x->e_mbd;
- int i, j, k, m;
- v64x64 vt;
- v16x16 vt2[16];
- int force_split[21];
- uint8_t *s;
- const uint8_t *d;
- int sp;
- int dp;
- int pixels_wide = 64, pixels_high = 64;
- int64_t thresholds[4] = { cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
- cpi->vbp_thresholds[2], cpi->vbp_thresholds[3] };
+ MACROBLOCKD *const xd = &x->e_mbd;
+ VAR_TREE *const vt = td->var_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
+#if CONFIG_DUAL_FILTER
+ int i;
+#endif
+ const uint8_t *src;
+ const uint8_t *ref;
+ int src_stride;
+ int ref_stride;
+ int pixels_wide = 8 * num_8x8_blocks_wide_lookup[cm->sb_size];
+ int pixels_high = 8 * num_8x8_blocks_high_lookup[cm->sb_size];
+ int64_t thresholds[5] = {
+ cpi->vbp_thresholds[0], cpi->vbp_thresholds[1], cpi->vbp_thresholds[2],
+ cpi->vbp_thresholds[3], cpi->vbp_thresholds[4],
+ };
+ BLOCK_SIZE bsize_min[5] = { BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
+ cpi->vbp_bsize_min, BLOCK_8X8 };
+ const int start_level = cm->sb_size == BLOCK_64X64 ? 1 : 0;
+ const int64_t *const thre = thresholds + start_level;
+ const BLOCK_SIZE *const bmin = bsize_min + start_level;
- // Always use 4x4 partition for key frame.
const int is_key_frame = (cm->frame_type == KEY_FRAME);
- const int use_4x4_partition = is_key_frame;
const int low_res = (cm->width <= 352 && cm->height <= 288);
- int variance4x4downsample[16];
int segment_id = CR_SEGMENT_ID_BASE;
+
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
const uint8_t *const map =
cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
- segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+ segment_id = get_segment_id(cm, map, cm->sb_size, mi_row, mi_col);
if (cyclic_refresh_segment_id_boosted(segment_id)) {
int q = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
@@ -653,25 +853,35 @@
}
}
- set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+ set_offsets(cpi, tile, x, mi_row, mi_col, cm->sb_size);
if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3);
- s = x->plane[0].src.buf;
- sp = x->plane[0].src.stride;
+ src = x->plane[0].src.buf;
+ src_stride = x->plane[0].src.stride;
if (!is_key_frame) {
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
-
- const YV12_BUFFER_CONFIG *yv12_g = NULL;
+ const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
unsigned int y_sad, y_sad_g;
- const BLOCK_SIZE bsize = BLOCK_32X32 + (mi_col + 4 < cm->mi_cols) * 2 +
- (mi_row + 4 < cm->mi_rows);
+
+ const int hbs = cm->mib_size / 2;
+ const int split_vert = mi_col + hbs >= cm->mi_cols;
+ const int split_horz = mi_row + hbs >= cm->mi_rows;
+ BLOCK_SIZE bsize;
+
+ if (split_vert && split_horz)
+ bsize = get_subsize(cm->sb_size, PARTITION_SPLIT);
+ else if (split_vert)
+ bsize = get_subsize(cm->sb_size, PARTITION_VERT);
+ else if (split_horz)
+ bsize = get_subsize(cm->sb_size, PARTITION_HORZ);
+ else
+ bsize = cm->sb_size;
assert(yv12 != NULL);
- yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
if (yv12_g && yv12_g != yv12) {
av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
@@ -687,11 +897,16 @@
&cm->frame_refs[LAST_FRAME - 1].sf);
mbmi->ref_frame[0] = LAST_FRAME;
mbmi->ref_frame[1] = NONE;
- mbmi->sb_type = BLOCK_64X64;
+ mbmi->sb_type = cm->sb_size;
mbmi->mv[0].as_int = 0;
+#if CONFIG_DUAL_FILTER
+ for (i = 0; i < 4; ++i) mbmi->interp_filter[i] = BILINEAR;
+#else
mbmi->interp_filter = BILINEAR;
+#endif
y_sad = av1_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+
if (y_sad_g < y_sad) {
av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
&cm->frame_refs[GOLDEN_FRAME - 1].sf);
@@ -702,190 +917,107 @@
x->pred_mv[LAST_FRAME] = mbmi->mv[0].as_mv;
}
- av1_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, cm->sb_size);
- d = xd->plane[0].dst.buf;
- dp = xd->plane[0].dst.stride;
+ ref = xd->plane[0].dst.buf;
+ ref_stride = xd->plane[0].dst.stride;
- // If the y_sad is very small, take 64x64 as partition and exit.
- // Don't check on boosted segment for now, as 64x64 is suppressed there.
+ // If the y_sad is very small, take the largest partition and exit.
+ // Don't check on boosted segment for now, as largest is suppressed there.
if (segment_id == CR_SEGMENT_ID_BASE && y_sad < cpi->vbp_threshold_sad) {
- const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64];
- const int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64];
- if (mi_col + block_width / 2 < cm->mi_cols &&
- mi_row + block_height / 2 < cm->mi_rows) {
- set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_64X64);
- return 0;
+ if (!split_vert && !split_horz) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, cm->sb_size);
+ return;
}
}
} else {
- d = AV1_VAR_OFFS;
- dp = 0;
+ ref = AV1_VAR_OFFS;
+ ref_stride = 0;
#if CONFIG_AOM_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
switch (xd->bd) {
- case 10: d = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10); break;
- case 12: d = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12); break;
+ case 10: ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10); break;
+ case 12: ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12); break;
case 8:
- default: d = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8); break;
+ default: ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8); break;
}
}
#endif // CONFIG_AOM_HIGHBITDEPTH
}
- // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
- // 5-20 for the 16x16 blocks.
- force_split[0] = 0;
- // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
- // for splits.
- for (i = 0; i < 4; i++) {
- const int x32_idx = ((i & 1) << 5);
- const int y32_idx = ((i >> 1) << 5);
- const int i2 = i << 2;
- force_split[i + 1] = 0;
- for (j = 0; j < 4; j++) {
- const int x16_idx = x32_idx + ((j & 1) << 4);
- const int y16_idx = y32_idx + ((j >> 1) << 4);
- const int split_index = 5 + i2 + j;
- v16x16 *vst = &vt.split[i].split[j];
- force_split[split_index] = 0;
- variance4x4downsample[i2 + j] = 0;
- if (!is_key_frame) {
- fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
+ init_variance_tree(
+ vt,
#if CONFIG_AOM_HIGHBITDEPTH
- xd->cur_buf->flags,
-#endif
- pixels_wide, pixels_high, is_key_frame);
- fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
- get_variance(&vt.split[i].split[j].part_variances.none);
- if (vt.split[i].split[j].part_variances.none.variance > thresholds[2]) {
- // 16X16 variance is above threshold for split, so force split to 8x8
- // for this 16x16 block (this also forces splits for upper levels).
- force_split[split_index] = 1;
- force_split[i + 1] = 1;
- force_split[0] = 1;
- } else if (vt.split[i].split[j].part_variances.none.variance >
- thresholds[1] &&
- !cyclic_refresh_segment_id_boosted(segment_id)) {
- // We have some nominal amount of 16x16 variance (based on average),
- // compute the minmax over the 8x8 sub-blocks, and if above threshold,
- // force split to 8x8 block for this 16x16 block.
- int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
-#if CONFIG_AOM_HIGHBITDEPTH
- xd->cur_buf->flags,
-#endif
- pixels_wide, pixels_high);
- if (minmax > cpi->vbp_threshold_minmax) {
- force_split[split_index] = 1;
- force_split[i + 1] = 1;
- force_split[0] = 1;
- }
- }
- }
- if (is_key_frame || (low_res &&
- vt.split[i].split[j].part_variances.none.variance >
- (thresholds[1] << 1))) {
- force_split[split_index] = 0;
- // Go down to 4x4 down-sampling for variance.
- variance4x4downsample[i2 + j] = 1;
- for (k = 0; k < 4; k++) {
- int x8_idx = x16_idx + ((k & 1) << 3);
- int y8_idx = y16_idx + ((k >> 1) << 3);
- v8x8 *vst2 = is_key_frame ? &vst->split[k] : &vt2[i2 + j].split[k];
- fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
-#if CONFIG_AOM_HIGHBITDEPTH
- xd->cur_buf->flags,
-#endif
- pixels_wide, pixels_high, is_key_frame);
- }
- }
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH,
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ cm->sb_size, (is_key_frame || low_res) ? BLOCK_4X4 : BLOCK_8X8,
+ pixels_wide, pixels_high, src, src_stride, ref, ref_stride);
+
+ // Fill in the entire tree of variances and compute splits.
+ if (is_key_frame) {
+ fill_variance_tree(vt, BLOCK_4X4);
+ check_split_key_frame(vt, thre[1]);
+ } else {
+ fill_variance_tree(vt, BLOCK_8X8);
+ check_split(cpi, vt, segment_id, thre);
+ if (low_res) {
+ refine_variance_tree(vt, thre[1] << 1);
}
}
- // Fill the rest of the variance tree by summing split partition values.
- for (i = 0; i < 4; i++) {
- const int i2 = i << 2;
- for (j = 0; j < 4; j++) {
- if (variance4x4downsample[i2 + j] == 1) {
- v16x16 *vtemp = (!is_key_frame) ? &vt2[i2 + j] : &vt.split[i].split[j];
- for (m = 0; m < 4; m++) fill_variance_tree(&vtemp->split[m], BLOCK_8X8);
- fill_variance_tree(vtemp, BLOCK_16X16);
- }
- }
- fill_variance_tree(&vt.split[i], BLOCK_32X32);
- // If variance of this 32x32 block is above the threshold, force the block
- // to split. This also forces a split on the upper (64x64) level.
- if (!force_split[i + 1]) {
- get_variance(&vt.split[i].part_variances.none);
- if (vt.split[i].part_variances.none.variance > thresholds[1]) {
- force_split[i + 1] = 1;
- force_split[0] = 1;
- }
- }
- }
- if (!force_split[0]) {
- fill_variance_tree(&vt, BLOCK_64X64);
- get_variance(&vt.part_variances.none);
- }
+ vt->force_split |= mi_col + cm->mib_size > cm->mi_cols ||
+ mi_row + cm->mib_size > cm->mi_rows;
// Now go through the entire structure, splitting every block size until
// we get to one that's got a variance lower than our threshold.
- if (mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
- !set_vt_partitioning(cpi, x, xd, &vt, BLOCK_64X64, mi_row, mi_col,
- thresholds[0], BLOCK_16X16, force_split[0])) {
- for (i = 0; i < 4; ++i) {
- const int x32_idx = ((i & 1) << 2);
- const int y32_idx = ((i >> 1) << 2);
- const int i2 = i << 2;
- if (!set_vt_partitioning(cpi, x, xd, &vt.split[i], BLOCK_32X32,
- (mi_row + y32_idx), (mi_col + x32_idx),
- thresholds[1], BLOCK_16X16,
- force_split[i + 1])) {
- for (j = 0; j < 4; ++j) {
- const int x16_idx = ((j & 1) << 1);
- const int y16_idx = ((j >> 1) << 1);
- // For inter frames: if variance4x4downsample[] == 1 for this 16x16
- // block, then the variance is based on 4x4 down-sampling, so use vt2
- // in set_vt_partioning(), otherwise use vt.
- v16x16 *vtemp = (!is_key_frame && variance4x4downsample[i2 + j] == 1)
- ? &vt2[i2 + j]
- : &vt.split[i].split[j];
- if (!set_vt_partitioning(
- cpi, x, xd, vtemp, BLOCK_16X16, mi_row + y32_idx + y16_idx,
- mi_col + x32_idx + x16_idx, thresholds[2], cpi->vbp_bsize_min,
- force_split[5 + i2 + j])) {
- for (k = 0; k < 4; ++k) {
- const int x8_idx = (k & 1);
- const int y8_idx = (k >> 1);
- if (use_4x4_partition) {
- if (!set_vt_partitioning(cpi, x, xd, &vtemp->split[k],
- BLOCK_8X8,
- mi_row + y32_idx + y16_idx + y8_idx,
- mi_col + x32_idx + x16_idx + x8_idx,
- thresholds[3], BLOCK_8X8, 0)) {
- set_block_size(
- cpi, x, xd, (mi_row + y32_idx + y16_idx + y8_idx),
- (mi_col + x32_idx + x16_idx + x8_idx), BLOCK_4X4);
- }
- } else {
- set_block_size(
- cpi, x, xd, (mi_row + y32_idx + y16_idx + y8_idx),
- (mi_col + x32_idx + x16_idx + x8_idx), BLOCK_8X8);
- }
- }
- }
- }
- }
+ set_vt_partitioning(cpi, x, xd, vt, mi_row, mi_col, thre, bmin);
+}
+
+#if CONFIG_DUAL_FILTER
+static void reset_intmv_filter_type(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+ MB_MODE_INFO *mbmi) {
+ int dir;
+ for (dir = 0; dir < 2; ++dir) {
+ if (!has_subpel_mv_component(xd->mi[0], xd, dir) &&
+ (mbmi->ref_frame[1] == NONE ||
+ !has_subpel_mv_component(xd->mi[0], xd, dir + 2)))
+ mbmi->interp_filter[dir] = (cm->interp_filter == SWITCHABLE)
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+ mbmi->interp_filter[dir + 2] = mbmi->interp_filter[dir];
+ }
+}
+
+static void update_filter_type_count(FRAME_COUNTS *counts,
+ const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi) {
+ int dir;
+ for (dir = 0; dir < 2; ++dir) {
+ if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
+ (mbmi->ref_frame[1] > INTRA_FRAME &&
+ has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ ++counts->switchable_interp[ctx][mbmi->interp_filter[dir]];
}
}
- return 0;
}
+#endif
+#if CONFIG_GLOBAL_MOTION
+static void update_global_motion_used(PREDICTION_MODE mode,
+ const MB_MODE_INFO *mbmi, AV1_COMP *cpi) {
+ if (mode == ZEROMV) {
+ ++cpi->global_motion_used[mbmi->ref_frame[0]];
+ if (has_second_ref(mbmi)) ++cpi->global_motion_used[mbmi->ref_frame[1]];
+ }
+}
+#endif // CONFIG_GLOBAL_MOTION
static void update_state(const AV1_COMP *const cpi, ThreadData *td,
PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
- BLOCK_SIZE bsize) {
+ BLOCK_SIZE bsize, RUN_TYPE dry_run) {
int i, x_idx, y;
const AV1_COMMON *const cm = &cpi->common;
+ RD_COUNTS *const rdc = &td->rd_counts;
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
struct macroblock_plane *const p = x->plane;
@@ -894,19 +1026,32 @@
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
MODE_INFO *mi_addr = xd->mi[0];
const struct segmentation *const seg = &cm->seg;
+ const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+ const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+ const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
+ MV_REF *const frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+ int w, h;
const int mis = cm->mi_stride;
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
const int mi_height = num_8x8_blocks_high_lookup[bsize];
+
#if CONFIG_REF_MV
int8_t rf_type;
#endif
+#if !CONFIG_SUPERTX
assert(mi->mbmi.sb_type == bsize);
+#endif
*mi_addr = *mi;
*x->mbmi_ext = ctx->mbmi_ext;
+#if CONFIG_DUAL_FILTER
+ reset_intmv_filter_type(cm, xd, mbmi);
+#endif
+
#if CONFIG_REF_MV
rf_type = av1_ref_frame_type(mbmi->ref_frame);
if (x->mbmi_ext->ref_mv_count[rf_type] > 1 && mbmi->sb_type >= BLOCK_8X8 &&
@@ -917,7 +1062,6 @@
? x->mbmi_ext->ref_mv_stack[rf_type][mbmi->ref_mv_idx].this_mv
: x->mbmi_ext->ref_mv_stack[rf_type][mbmi->ref_mv_idx].comp_mv;
clamp_mv_ref(&this_mv.as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
- lower_mv_precision(&this_mv.as_mv, cm->allow_high_precision_mv);
x->mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0] = this_mv;
mbmi->pred_mv[i] = this_mv;
mi->mbmi.pred_mv[i] = this_mv;
@@ -950,7 +1094,6 @@
#endif
p[i].eobs = ctx->eobs[i];
}
-
#if CONFIG_PALETTE
for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
#endif // CONFIG_PALETTE
@@ -966,9 +1109,10 @@
#if CONFIG_DELTA_Q
if (cpi->oxcf.aq_mode > NO_AQ && cpi->oxcf.aq_mode < DELTA_AQ)
- av1_init_plane_quantizers(cpi, x);
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
#else
- if (cpi->oxcf.aq_mode) av1_init_plane_quantizers(cpi, x);
+ if (cpi->oxcf.aq_mode)
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
#endif
if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
@@ -977,11 +1121,467 @@
}
x->skip = ctx->skip;
+
+#if CONFIG_VAR_TX
+ for (i = 0; i < 1; ++i)
+ memcpy(x->blk_skip[i], ctx->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif
+
+ if (dry_run) return;
+
+#if CONFIG_INTERNAL_STATS
+ {
+ unsigned int *const mode_chosen_counts =
+ (unsigned int *)cpi->mode_chosen_counts; // Cast const away.
+ if (frame_is_intra_only(cm)) {
+ static const int kf_mode_index[] = {
+ THR_DC /*DC_PRED*/, THR_V_PRED /*V_PRED*/,
+ THR_H_PRED /*H_PRED*/, THR_D45_PRED /*D45_PRED*/,
+ THR_D135_PRED /*D135_PRED*/, THR_D117_PRED /*D117_PRED*/,
+ THR_D153_PRED /*D153_PRED*/, THR_D207_PRED /*D207_PRED*/,
+ THR_D63_PRED /*D63_PRED*/, THR_TM /*TM_PRED*/,
+ };
+ ++mode_chosen_counts[kf_mode_index[mbmi->mode]];
+ } else {
+      // Note how often each mode is chosen as best
+ ++mode_chosen_counts[ctx->best_mode_index];
+ }
+ }
+#endif
+ if (!frame_is_intra_only(cm)) {
+ if (is_inter_block(mbmi)) {
+ av1_update_mv_count(td);
+#if CONFIG_GLOBAL_MOTION
+ if (bsize >= BLOCK_8X8) {
+ // TODO(sarahparker): global motion stats need to be handled per-tile
+ // to be compatible with tile-based threading.
+ update_global_motion_used(mbmi->mode, mbmi, (AV1_COMP *)cpi);
+ } else {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int j = idy * 2 + idx;
+ update_global_motion_used(mi->bmi[j].as_mode, mbmi,
+ (AV1_COMP *)cpi);
+ }
+ }
+ }
+#endif // CONFIG_GLOBAL_MOTION
+ if (cm->interp_filter == SWITCHABLE
+#if CONFIG_EXT_INTERP
+ && av1_is_interp_needed(xd)
+#endif
+ ) {
+#if CONFIG_DUAL_FILTER
+ update_filter_type_count(td->counts, xd, mbmi);
+#else
+ const int switchable_ctx = av1_get_pred_context_switchable_interp(xd);
+ ++td->counts->switchable_interp[switchable_ctx][mbmi->interp_filter];
+#endif
+ }
+ }
+
+ rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+ rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+ rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+ }
+
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+ mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+ mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+ mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+ }
+ }
}
+#if CONFIG_SUPERTX
+static void update_state_supertx(const AV1_COMP *const cpi, ThreadData *td,
+ PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, RUN_TYPE dry_run) {
+ int y, x_idx;
+#if CONFIG_VAR_TX || CONFIG_REF_MV
+ int i;
+#endif
+ const AV1_COMMON *const cm = &cpi->common;
+ RD_COUNTS *const rdc = &td->rd_counts;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *mi = &ctx->mic;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ MODE_INFO *mi_addr = xd->mi[0];
+ const struct segmentation *const seg = &cm->seg;
+ const int mis = cm->mi_stride;
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ const int x_mis = AOMMIN(mi_width, cm->mi_cols - mi_col);
+ const int y_mis = AOMMIN(mi_height, cm->mi_rows - mi_row);
+ MV_REF *const frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+ int w, h;
+
+#if CONFIG_REF_MV
+ int8_t rf_type;
+#endif
+
+ *mi_addr = *mi;
+ *x->mbmi_ext = ctx->mbmi_ext;
+ assert(is_inter_block(mbmi));
+ assert(mbmi->tx_size == ctx->mic.mbmi.tx_size);
+
+#if CONFIG_DUAL_FILTER
+ reset_intmv_filter_type(cm, xd, mbmi);
+#endif
+
+#if CONFIG_REF_MV
+ rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ if (x->mbmi_ext->ref_mv_count[rf_type] > 1 && mbmi->sb_type >= BLOCK_8X8 &&
+ mbmi->mode == NEWMV) {
+ for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
+ int_mv this_mv =
+ (i == 0)
+ ? x->mbmi_ext->ref_mv_stack[rf_type][mbmi->ref_mv_idx].this_mv
+ : x->mbmi_ext->ref_mv_stack[rf_type][mbmi->ref_mv_idx].comp_mv;
+ clamp_mv_ref(&this_mv.as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
+ lower_mv_precision(&this_mv.as_mv, cm->allow_high_precision_mv);
+ x->mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0] = this_mv;
+ mbmi->pred_mv[i] = this_mv;
+ }
+ }
+#endif
+
+ // If segmentation in use
+ if (seg->enabled) {
+ if (cpi->vaq_refresh) {
+ const int energy =
+ bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize);
+ mi_addr->mbmi.segment_id = av1_vaq_segment_id(energy);
+ } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+ // For cyclic refresh mode, now update the segment map
+ // and set the segment id.
+ av1_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row, mi_col,
+ bsize, ctx->rate, ctx->dist, 1);
+ } else {
+ // Otherwise just set the segment id based on the current segment map
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ mi_addr->mbmi.segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+ }
+ mi_addr->mbmi.segment_id_supertx = MAX_SEGMENTS;
+ }
+
+  // Restore the coding context of the MB to that which was in place
+  // when the mode was picked for it
+ for (y = 0; y < mi_height; y++)
+ for (x_idx = 0; x_idx < mi_width; x_idx++)
+ if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx &&
+ (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
+ xd->mi[x_idx + y * mis] = mi_addr;
+ }
+
+ if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
+ mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+ }
+
+ x->skip = ctx->skip;
+
+#if CONFIG_VAR_TX
+ for (i = 0; i < 1; ++i)
+ memcpy(x->blk_skip[i], ctx->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+
+ if (!is_inter_block(mbmi) || mbmi->skip)
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+#endif // CONFIG_VAR_TX
+
+#if CONFIG_VAR_TX
+ {
+ const TX_SIZE mtx = mbmi->tx_size;
+ const int num_4x4_blocks_wide = tx_size_wide_unit[mtx] >> 1;
+ const int num_4x4_blocks_high = tx_size_high_unit[mtx] >> 1;
+ int idy, idx;
+ mbmi->inter_tx_size[0][0] = mtx;
+ for (idy = 0; idy < num_4x4_blocks_high; ++idy)
+ for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
+ mbmi->inter_tx_size[idy][idx] = mtx;
+ }
+#endif // CONFIG_VAR_TX
+ // Turn motion variation off for supertx
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+
+ if (dry_run) return;
+
+ if (!frame_is_intra_only(cm)) {
+ av1_update_mv_count(td);
+
+ if (cm->interp_filter == SWITCHABLE
+#if CONFIG_EXT_INTERP
+ && av1_is_interp_needed(xd)
+#endif
+ ) {
+#if CONFIG_DUAL_FILTER
+ update_filter_type_count(td->counts, xd, mbmi);
+#else
+ const int pred_ctx = av1_get_pred_context_switchable_interp(xd);
+ ++td->counts->switchable_interp[pred_ctx][mbmi->interp_filter];
+#endif
+ }
+
+ rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+ rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+ rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+ }
+
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+ mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+ mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+ mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+ }
+ }
+}
+
+static void update_state_sb_supertx(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ RUN_TYPE dry_run, PC_TREE *pc_tree) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+ PARTITION_TYPE partition = pc_tree->partitioning;
+ BLOCK_SIZE subsize = get_subsize(bsize, partition);
+ int i;
+#if CONFIG_EXT_PARTITION_TYPES
+ BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+ PICK_MODE_CONTEXT *pmc = NULL;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+ x->mb_energy = av1_block_energy(cpi, x, bsize);
+
+ switch (partition) {
+ case PARTITION_NONE:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->none, mi_row, mi_col, subsize,
+ dry_run);
+ break;
+ case PARTITION_VERT:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->vertical[0], mi_row, mi_col,
+ subsize, dry_run);
+ if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
+ update_state_supertx(cpi, td, &pc_tree->vertical[1], mi_row,
+ mi_col + hbs, subsize, dry_run);
+ }
+ pmc = &pc_tree->vertical_supertx;
+ break;
+ case PARTITION_HORZ:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->horizontal[0], mi_row, mi_col,
+ subsize, dry_run);
+ if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->horizontal[1], mi_row + hbs,
+ mi_col, subsize, dry_run);
+ }
+ pmc = &pc_tree->horizontal_supertx;
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8) {
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, pc_tree->leaf_split[0], mi_row, mi_col,
+ subsize, dry_run);
+ } else {
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, subsize, dry_run,
+ pc_tree->split[0]);
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
+ update_state_sb_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize,
+ dry_run, pc_tree->split[1]);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
+ update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize,
+ dry_run, pc_tree->split[2]);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, subsize);
+ update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs,
+ subsize, dry_run, pc_tree->split[3]);
+ }
+ pmc = &pc_tree->split_supertx;
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->horizontala[0], mi_row, mi_col,
+ bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->horizontala[1], mi_row,
+ mi_col + hbs, bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->horizontala[2], mi_row + hbs,
+ mi_col, subsize, dry_run);
+ pmc = &pc_tree->horizontala_supertx;
+ break;
+ case PARTITION_HORZ_B:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->horizontalb[0], mi_row, mi_col,
+ subsize, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->horizontalb[1], mi_row + hbs,
+ mi_col, bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->horizontalb[2], mi_row + hbs,
+ mi_col + hbs, bsize2, dry_run);
+ pmc = &pc_tree->horizontalb_supertx;
+ break;
+ case PARTITION_VERT_A:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->verticala[0], mi_row, mi_col,
+ bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->verticala[1], mi_row + hbs,
+ mi_col, bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
+ update_state_supertx(cpi, td, &pc_tree->verticala[2], mi_row,
+ mi_col + hbs, subsize, dry_run);
+ pmc = &pc_tree->verticala_supertx;
+ break;
+ case PARTITION_VERT_B:
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+ update_state_supertx(cpi, td, &pc_tree->verticalb[0], mi_row, mi_col,
+ subsize, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->verticalb[1], mi_row,
+ mi_col + hbs, bsize2, dry_run);
+ set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
+ update_state_supertx(cpi, td, &pc_tree->verticalb[2], mi_row + hbs,
+ mi_col + hbs, bsize2, dry_run);
+ pmc = &pc_tree->verticalb_supertx;
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0);
+ }
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ if (pmc != NULL) {
+ p[i].coeff = pmc->coeff[i];
+ p[i].qcoeff = pmc->qcoeff[i];
+ pd[i].dqcoeff = pmc->dqcoeff[i];
+ p[i].eobs = pmc->eobs[i];
+ } else {
+ // These should never be used
+ p[i].coeff = NULL;
+ p[i].qcoeff = NULL;
+ pd[i].dqcoeff = NULL;
+ p[i].eobs = NULL;
+ }
+ }
+}
+
+static void update_supertx_param(ThreadData *td, PICK_MODE_CONTEXT *ctx,
+ int best_tx, TX_SIZE supertx_size) {
+ MACROBLOCK *const x = &td->mb;
+#if CONFIG_VAR_TX
+ int i;
+
+ for (i = 0; i < 1; ++i)
+ memcpy(ctx->blk_skip[i], x->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+ ctx->mic.mbmi.min_tx_size = get_min_tx_size(supertx_size);
+#endif // CONFIG_VAR_TX
+ ctx->mic.mbmi.tx_size = supertx_size;
+ ctx->skip = x->skip;
+ ctx->mic.mbmi.tx_type = best_tx;
+}
+
+static void update_supertx_param_sb(const AV1_COMP *const cpi, ThreadData *td,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int best_tx, TX_SIZE supertx_size,
+ PC_TREE *pc_tree) {
+ const AV1_COMMON *const cm = &cpi->common;
+ int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+ PARTITION_TYPE partition = pc_tree->partitioning;
+ BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+ int i;
+#endif
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ switch (partition) {
+ case PARTITION_NONE:
+ update_supertx_param(td, &pc_tree->none, best_tx, supertx_size);
+ break;
+ case PARTITION_VERT:
+ update_supertx_param(td, &pc_tree->vertical[0], best_tx, supertx_size);
+ if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8)
+ update_supertx_param(td, &pc_tree->vertical[1], best_tx, supertx_size);
+ break;
+ case PARTITION_HORZ:
+ update_supertx_param(td, &pc_tree->horizontal[0], best_tx, supertx_size);
+ if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8)
+ update_supertx_param(td, &pc_tree->horizontal[1], best_tx,
+ supertx_size);
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8) {
+ update_supertx_param(td, pc_tree->leaf_split[0], best_tx, supertx_size);
+ } else {
+ update_supertx_param_sb(cpi, td, mi_row, mi_col, subsize, best_tx,
+ supertx_size, pc_tree->split[0]);
+ update_supertx_param_sb(cpi, td, mi_row, mi_col + hbs, subsize, best_tx,
+ supertx_size, pc_tree->split[1]);
+ update_supertx_param_sb(cpi, td, mi_row + hbs, mi_col, subsize, best_tx,
+ supertx_size, pc_tree->split[2]);
+ update_supertx_param_sb(cpi, td, mi_row + hbs, mi_col + hbs, subsize,
+ best_tx, supertx_size, pc_tree->split[3]);
+ }
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ for (i = 0; i < 3; i++)
+ update_supertx_param(td, &pc_tree->horizontala[i], best_tx,
+ supertx_size);
+ break;
+ case PARTITION_HORZ_B:
+ for (i = 0; i < 3; i++)
+ update_supertx_param(td, &pc_tree->horizontalb[i], best_tx,
+ supertx_size);
+ break;
+ case PARTITION_VERT_A:
+ for (i = 0; i < 3; i++)
+ update_supertx_param(td, &pc_tree->verticala[i], best_tx, supertx_size);
+ break;
+ case PARTITION_VERT_B:
+ for (i = 0; i < 3; i++)
+ update_supertx_param(td, &pc_tree->verticalb[i], best_tx, supertx_size);
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0);
+ }
+}
+#endif // CONFIG_SUPERTX
+
void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
int mi_row, int mi_col) {
uint8_t *const buffers[3] = { src->y_buffer, src->u_buffer, src->v_buffer };
+ const int widths[3] = { src->y_crop_width, src->uv_crop_width,
+ src->uv_crop_width };
+ const int heights[3] = { src->y_crop_height, src->uv_crop_height,
+ src->uv_crop_height };
const int strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
int i;
@@ -989,8 +1589,9 @@
x->e_mbd.cur_buf = src;
for (i = 0; i < MAX_MB_PLANE; i++)
- setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mi_row, mi_col,
- NULL, x->e_mbd.plane[i].subsampling_x,
+ setup_pred_plane(&x->plane[i].src, buffers[i], widths[i], heights[i],
+ strides[i], mi_row, mi_col, NULL,
+ x->e_mbd.plane[i].subsampling_x,
x->e_mbd.plane[i].subsampling_y);
}
@@ -998,7 +1599,7 @@
int8_t segment_id) {
int segment_qindex;
const AV1_COMMON *const cm = &cpi->common;
- av1_init_plane_quantizers(cpi, x);
+ av1_init_plane_quantizers(cpi, x, segment_id);
aom_clear_system_state();
segment_qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
return av1_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
@@ -1006,8 +1607,15 @@
static void rd_pick_sb_modes(const AV1_COMP *const cpi, TileDataEnc *tile_data,
MACROBLOCK *const x, int mi_row, int mi_col,
- RD_COST *rd_cost, BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
+ RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+ int *totalrate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition,
+#endif
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd) {
const AV1_COMMON *const cm = &cpi->common;
TileInfo *const tile_info = &tile_data->tile_info;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -1030,6 +1638,21 @@
set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
mbmi = &xd->mi[0]->mbmi;
mbmi->sb_type = bsize;
+#if CONFIG_RD_DEBUG
+ mbmi->mi_row = mi_row;
+ mbmi->mi_col = mi_col;
+#endif
+#if CONFIG_SUPERTX
+ // We set tx_size here as skip blocks would otherwise not set it.
+ // tx_size needs to be set at this point as supertx_enable in
+ // write_modes_sb is computed based on this, and if the garbage in memory
+ // just happens to be the supertx_size, then the packer will code this
+ // block as a supertx block, even if rdopt did not pick it as such.
+ mbmi->tx_size = max_txsize_lookup[bsize];
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ mbmi->partition = partition;
+#endif
for (i = 0; i < MAX_MB_PLANE; ++i) {
p[i].coeff = ctx->coeff[i];
@@ -1068,25 +1691,19 @@
orig_rdmult = x->rdmult;
if (aq_mode == VARIANCE_AQ) {
- const int energy =
- bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize);
- if (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame ||
- (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+ if (cpi->vaq_refresh) {
+ const int energy =
+ bsize <= BLOCK_16X16 ? x->mb_energy : av1_block_energy(cpi, x, bsize);
mbmi->segment_id = av1_vaq_segment_id(energy);
- } else {
- const uint8_t *const map =
- cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
- mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+ // Re-initialise quantiser
+ av1_init_plane_quantizers(cpi, x, mbmi->segment_id);
}
x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
} else if (aq_mode == COMPLEXITY_AQ) {
x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
} else if (aq_mode == CYCLIC_REFRESH_AQ) {
- const uint8_t *const map =
- cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
// If segment is boosted, use rdmult for that segment.
- if (cyclic_refresh_segment_id_boosted(
- get_segment_id(cm, map, bsize, mi_row, mi_col)))
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
}
@@ -1094,17 +1711,42 @@
// as a predictor for MBs that follow in the SB
if (frame_is_intra_only(cm)) {
av1_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+ *totalrate_nocoef = 0;
+#endif // CONFIG_SUPERTX
} else {
if (bsize >= BLOCK_8X8) {
- if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+ if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
ctx, best_rd);
- else
+#if CONFIG_SUPERTX
+ *totalrate_nocoef = rd_cost->rate;
+#endif // CONFIG_SUPERTX
+ } else {
av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+#if CONFIG_SUPERTX
+ totalrate_nocoef,
+#endif // CONFIG_SUPERTX
bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+ assert(*totalrate_nocoef >= 0);
+#endif // CONFIG_SUPERTX
+ }
} else {
- av1_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost,
- bsize, ctx, best_rd);
+ if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ // The decoder rejects sub8x8 partitions when SEG_LVL_SKIP is set.
+ rd_cost->rate = INT_MAX;
+ } else {
+ av1_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col,
+ rd_cost,
+#if CONFIG_SUPERTX
+ totalrate_nocoef,
+#endif // CONFIG_SUPERTX
+ bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+ assert(*totalrate_nocoef >= 0);
+#endif // CONFIG_SUPERTX
+ }
}
}
@@ -1128,15 +1770,25 @@
#if CONFIG_REF_MV
static void update_inter_mode_stats(FRAME_COUNTS *counts, PREDICTION_MODE mode,
- const int16_t mode_context) {
+#if CONFIG_EXT_INTER
+ int is_compound,
+#endif // CONFIG_EXT_INTER
+ int16_t mode_context) {
int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
+#if CONFIG_EXT_INTER
+ if (mode == NEWMV || mode == NEWFROMNEARMV) {
+ if (!is_compound) ++counts->new2mv_mode[mode == NEWFROMNEARMV];
+#else
if (mode == NEWMV) {
+#endif // CONFIG_EXT_INTER
++counts->newmv_mode[mode_ctx][0];
return;
} else {
++counts->newmv_mode[mode_ctx][1];
- if (mode_context & (1 << ALL_ZERO_FLAG_OFFSET)) return;
+ if (mode_context & (1 << ALL_ZERO_FLAG_OFFSET)) {
+ return;
+ }
mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
if (mode == ZEROMV) {
@@ -1156,46 +1808,13 @@
}
#endif
-static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi,
- const MODE_INFO *above_mi, const MODE_INFO *left_mi,
- const int intraonly) {
- const PREDICTION_MODE y_mode = mi->mbmi.mode;
- const PREDICTION_MODE uv_mode = mi->mbmi.uv_mode;
- const BLOCK_SIZE bsize = mi->mbmi.sb_type;
-
- if (bsize < BLOCK_8X8) {
- int idx, idy;
- const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
- const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
- for (idy = 0; idy < 2; idy += num_4x4_h)
- for (idx = 0; idx < 2; idx += num_4x4_w) {
- const int bidx = idy * 2 + idx;
- const PREDICTION_MODE bmode = mi->bmi[bidx].as_mode;
- if (intraonly) {
- const PREDICTION_MODE a = av1_above_block_mode(mi, above_mi, bidx);
- const PREDICTION_MODE l = av1_left_block_mode(mi, left_mi, bidx);
- ++counts->kf_y_mode[a][l][bmode];
- } else {
- ++counts->y_mode[0][bmode];
- }
- }
- } else {
- if (intraonly) {
- const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, 0);
- const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, 0);
- ++counts->kf_y_mode[above][left][y_mode];
- } else {
- ++counts->y_mode[size_group_lookup[bsize]][y_mode];
- }
- }
-
- ++counts->uv_mode[y_mode][uv_mode];
-}
-
-static void update_stats(const AV1_COMP *const cpi, ThreadData *td,
- PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
- BLOCK_SIZE bsize) {
- const AV1_COMMON *const cm = &cpi->common;
+static void update_stats(const AV1_COMMON *const cm, ThreadData *td, int mi_row,
+ int mi_col
+#if CONFIG_SUPERTX
+ ,
+ int supertx_enabled
+#endif
+ ) {
#if CONFIG_DELTA_Q
MACROBLOCK *x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -1203,36 +1822,10 @@
const MACROBLOCK *x = &td->mb;
const MACROBLOCKD *const xd = &x->e_mbd;
#endif
- MODE_INFO **mi_8x8 = xd->mi;
const MODE_INFO *const mi = xd->mi[0];
const MB_MODE_INFO *const mbmi = &mi->mbmi;
const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
- RD_COUNTS *const rdc = &td->rd_counts;
- const int seg_skip =
- segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
- const int mis = cm->mi_stride;
- const int mi_width = num_8x8_blocks_wide_lookup[bsize];
- const int mi_height = num_8x8_blocks_high_lookup[bsize];
-
-#if CONFIG_INTERNAL_STATS
- {
- unsigned int *const mode_chosen_counts =
- (unsigned int *)cpi->mode_chosen_counts; // Cast const away.
- if (frame_is_intra_only(cm)) {
- static const int kf_mode_index[] = {
- THR_DC /*DC_PRED*/, THR_V_PRED /*V_PRED*/,
- THR_H_PRED /*H_PRED*/, THR_D45_PRED /*D45_PRED*/,
- THR_D135_PRED /*D135_PRED*/, THR_D117_PRED /*D117_PRED*/,
- THR_D153_PRED /*D153_PRED*/, THR_D207_PRED /*D207_PRED*/,
- THR_D63_PRED /*D63_PRED*/, THR_TM /*TM_PRED*/,
- };
- ++mode_chosen_counts[kf_mode_index[mbmi->mode]];
- } else {
- // Note how often each mode chosen as best
- ++mode_chosen_counts[ctx->best_mode_index];
- }
- }
-#endif
+ const BLOCK_SIZE bsize = mbmi->sb_type;
#if CONFIG_DELTA_Q
// delta quant applies to both intra and inter
@@ -1249,75 +1842,20 @@
if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++;
xd->prev_qindex = mbmi->current_q_index;
}
+#else
+ (void)mi_row;
+ (void)mi_col;
#endif
-
- if (!is_inter_block(mbmi))
- sum_intra_stats(td->counts, mi, xd->above_mi, xd->left_mi,
- frame_is_intra_only(cm));
-
- if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 &&
- !(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) {
- ++get_tx_counts(max_txsize_lookup[bsize], get_tx_size_context(xd),
- &td->counts->tx)[mbmi->tx_size];
- } else {
- int i, j;
- TX_SIZE tx_size;
- // The new intra coding scheme requires no change of transform size
- if (is_inter_block(&mi->mbmi)) {
- tx_size = AOMMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
- max_txsize_lookup[bsize]);
- } else {
- tx_size = (bsize >= BLOCK_8X8) ? mbmi->tx_size : TX_4X4;
- }
-
- for (j = 0; j < mi_height; j++)
- for (i = 0; i < mi_width; i++)
- if (mi_col + i < cm->mi_cols && mi_row + j < cm->mi_rows)
- mi_8x8[mis * j + i]->mbmi.tx_size = tx_size;
- }
- ++td->counts->tx.tx_totals[mbmi->tx_size];
- ++td->counts->tx.tx_totals[get_uv_tx_size(mbmi, &xd->plane[1])];
- if (mbmi->tx_size < TX_32X32 && cm->base_qindex > 0 && !mbmi->skip &&
- !seg_skip) {
- if (is_inter_block(mbmi)) {
- ++td->counts->inter_ext_tx[mbmi->tx_size][mbmi->tx_type];
- } else {
- ++td->counts->intra_ext_tx[mbmi->tx_size]
- [intra_mode_to_tx_type_context[mbmi->mode]]
- [mbmi->tx_type];
- }
- }
-
if (!frame_is_intra_only(cm)) {
FRAME_COUNTS *const counts = td->counts;
const int inter_block = is_inter_block(mbmi);
const int seg_ref_active =
segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
-
- if (is_inter_block(mbmi)) {
- av1_update_mv_count(td);
-
- if (cm->interp_filter == SWITCHABLE) {
-#if CONFIG_EXT_INTERP
- if (is_interp_needed(xd))
-#endif
- {
- const int switchable_ctx = av1_get_pred_context_switchable_interp(xd);
- ++td->counts->switchable_interp[switchable_ctx][mbmi->interp_filter];
- }
- }
-
-#if CONFIG_MOTION_VAR
- if (is_motion_variation_allowed(mbmi))
- ++td->counts->motion_mode[bsize][mbmi->motion_mode];
-#endif // CONFIG_MOTION_VAR
- }
- rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
- rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
- rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
-
if (!seg_ref_active) {
- counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
+#if CONFIG_SUPERTX
+ if (!supertx_enabled)
+#endif
+ counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
// If the segment reference feature is enabled we have only a single
// reference frame allowed for the segment so exclude it from
// the reference frame counts used to work out probabilities.
@@ -1335,18 +1873,19 @@
#if CONFIG_EXT_REFS
const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
- counts->comp_fwdref[av1_get_pred_context_comp_fwdref_p(cm, xd)][0]
- [bit]++;
- if (!bit)
- counts->comp_fwdref[av1_get_pred_context_comp_fwdref_p1(cm, xd)][1]
- [ref0 == LAST_FRAME]++;
- else
- counts->comp_fwdref[av1_get_pred_context_comp_fwdref_p2(cm, xd)][2]
- [ref0 == GOLDEN_FRAME]++;
+ counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0][bit]++;
+ if (!bit) {
+ counts->comp_ref[av1_get_pred_context_comp_ref_p1(cm, xd)][1]
+ [ref0 == LAST_FRAME]++;
+ } else {
+ counts->comp_ref[av1_get_pred_context_comp_ref_p2(cm, xd)][2]
+ [ref0 == GOLDEN_FRAME]++;
+ }
+
counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(cm, xd)][0]
[ref1 == ALTREF_FRAME]++;
#else
- counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)]
+ counts->comp_ref[av1_get_pred_context_comp_ref_p(cm, xd)][0]
[ref0 == GOLDEN_FRAME]++;
#endif // CONFIG_EXT_REFS
} else {
@@ -1372,53 +1911,115 @@
#else
counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0]
[ref0 != LAST_FRAME]++;
- if (ref0 != LAST_FRAME)
+ if (ref0 != LAST_FRAME) {
counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
[ref0 != GOLDEN_FRAME]++;
+ }
#endif // CONFIG_EXT_REFS
}
+
+#if CONFIG_EXT_INTER
+ if (cm->reference_mode != COMPOUND_REFERENCE &&
+#if CONFIG_SUPERTX
+ !supertx_enabled &&
+#endif
+ is_interintra_allowed(mbmi)) {
+ const int bsize_group = size_group_lookup[bsize];
+ if (mbmi->ref_frame[1] == INTRA_FRAME) {
+ counts->interintra[bsize_group][1]++;
+ counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
+ if (is_interintra_wedge_used(bsize))
+ counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
+ } else {
+ counts->interintra[bsize_group][0]++;
+ }
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#if CONFIG_SUPERTX
+ if (!supertx_enabled)
+#endif // CONFIG_SUPERTX
+#if CONFIG_EXT_INTER
+ if (mbmi->ref_frame[1] != INTRA_FRAME)
+#endif // CONFIG_EXT_INTER
+ if (is_motion_variation_allowed(mbmi))
+ counts->motion_mode[mbmi->sb_type][mbmi->motion_mode]++;
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+
+#if CONFIG_EXT_INTER
+ if (cm->reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode) &&
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ !(is_motion_variation_allowed(mbmi) &&
+ mbmi->motion_mode != SIMPLE_TRANSLATION) &&
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ is_interinter_wedge_used(bsize)) {
+ counts->wedge_interinter[bsize][mbmi->use_wedge_interinter]++;
+ }
+#endif // CONFIG_EXT_INTER
}
}
- if (inter_block && !seg_skip) {
+ if (inter_block &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
int16_t mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
if (bsize >= BLOCK_8X8) {
const PREDICTION_MODE mode = mbmi->mode;
#if CONFIG_REF_MV
- mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
- mbmi->ref_frame, bsize, -1);
- update_inter_mode_stats(counts, mode, mode_ctx);
- if (mode == NEWMV) {
- uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
- int idx;
+#if CONFIG_EXT_INTER
+ if (has_second_ref(mbmi)) {
+ mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+ ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+ } else {
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, bsize, -1);
+ update_inter_mode_stats(counts, mode,
+#if CONFIG_EXT_INTER
+ has_second_ref(mbmi),
+#endif // CONFIG_EXT_INTER
+ mode_ctx);
- for (idx = 0; idx < 2; ++idx) {
- if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
- uint8_t drl_ctx =
- av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
- ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
+ if (mode == NEWMV) {
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ int idx;
- if (mbmi->ref_mv_idx == idx) break;
+ for (idx = 0; idx < 2; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+ ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
+
+ if (mbmi->ref_mv_idx == idx) break;
+ }
}
}
- }
- if (mode == NEARMV) {
- uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
- int idx;
+ if (mode == NEARMV) {
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ int idx;
- for (idx = 1; idx < 3; ++idx) {
- if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
- uint8_t drl_ctx =
- av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
- ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
+ for (idx = 1; idx < 3; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+ ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
- if (mbmi->ref_mv_idx == idx - 1) break;
+ if (mbmi->ref_mv_idx == idx - 1) break;
+ }
}
}
+#if CONFIG_EXT_INTER
}
+#endif // CONFIG_EXT_INTER
#else
- ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
+#if CONFIG_EXT_INTER
+ if (is_inter_compound_mode(mode))
+ ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+ else
+#endif // CONFIG_EXT_INTER
+ ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
#endif
} else {
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
@@ -1429,11 +2030,31 @@
const int j = idy * 2 + idx;
const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
#if CONFIG_REF_MV
- mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
- mbmi->ref_frame, bsize, j);
- update_inter_mode_stats(counts, b_mode, mode_ctx);
+#if CONFIG_EXT_INTER
+ if (has_second_ref(mbmi)) {
+ mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+ ++counts->inter_compound_mode[mode_ctx]
+ [INTER_COMPOUND_OFFSET(b_mode)];
+ } else {
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, bsize, j);
+ update_inter_mode_stats(counts, b_mode,
+#if CONFIG_EXT_INTER
+ has_second_ref(mbmi),
+#endif // CONFIG_EXT_INTER
+ mode_ctx);
+#if CONFIG_EXT_INTER
+ }
+#endif // CONFIG_EXT_INTER
#else
- ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
+#if CONFIG_EXT_INTER
+ if (is_inter_compound_mode(b_mode))
+ ++counts->inter_compound_mode[mode_ctx]
+ [INTER_COMPOUND_OFFSET(b_mode)];
+ else
+#endif // CONFIG_EXT_INTER
+ ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
#endif
}
}
@@ -1442,15 +2063,27 @@
}
}
-static void restore_context(MACROBLOCK *const x, int mi_row, int mi_col,
- ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
- ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
- PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
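+// Saved above/left entropy and partition contexts (plus, with CONFIG_VAR_TX,
+// the above/left txfm contexts) used to checkpoint and restore encoder state
+// around the RD partition search.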
+typedef struct {
+ ENTROPY_CONTEXT a[2 * MAX_MIB_SIZE * MAX_MB_PLANE];
+ ENTROPY_CONTEXT l[2 * MAX_MIB_SIZE * MAX_MB_PLANE];
+ PARTITION_CONTEXT sa[MAX_MIB_SIZE];
+ PARTITION_CONTEXT sl[MAX_MIB_SIZE];
+#if CONFIG_VAR_TX
+ TXFM_CONTEXT *p_ta;
+ TXFM_CONTEXT *p_tl;
+ TXFM_CONTEXT ta[MAX_MIB_SIZE];
+ TXFM_CONTEXT tl[MAX_MIB_SIZE];
+#endif
+} RD_SEARCH_MACROBLOCK_CONTEXT;
+
+static void restore_context(MACROBLOCK *x,
+ const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_row,
+ int mi_col,
#if CONFIG_PVQ
od_rollback_buffer *rdo_buf,
#endif
BLOCK_SIZE bsize) {
- MACROBLOCKD *const xd = &x->e_mbd;
+ MACROBLOCKD *xd = &x->e_mbd;
int p;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
@@ -1458,33 +2091,39 @@
int mi_height = num_8x8_blocks_high_lookup[bsize];
for (p = 0; p < MAX_MB_PLANE; p++) {
memcpy(xd->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
- a + num_4x4_blocks_wide * p,
+ ctx->a + num_4x4_blocks_wide * p,
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
xd->plane[p].subsampling_x);
memcpy(xd->left_context[p] +
((mi_row & MAX_MIB_MASK) * 2 >> xd->plane[p].subsampling_y),
- l + num_4x4_blocks_high * p,
+ ctx->l + num_4x4_blocks_high * p,
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
xd->plane[p].subsampling_y);
}
- memcpy(xd->above_seg_context + mi_col, sa,
+ memcpy(xd->above_seg_context + mi_col, ctx->sa,
sizeof(*xd->above_seg_context) * mi_width);
- memcpy(xd->left_seg_context + (mi_row & MAX_MIB_MASK), sl,
+ memcpy(xd->left_seg_context + (mi_row & MAX_MIB_MASK), ctx->sl,
sizeof(xd->left_seg_context[0]) * mi_height);
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = ctx->p_ta;
+ xd->left_txfm_context = ctx->p_tl;
+ memcpy(xd->above_txfm_context, ctx->ta,
+ sizeof(*xd->above_txfm_context) * mi_width);
+ memcpy(xd->left_txfm_context, ctx->tl,
+ sizeof(*xd->left_txfm_context) * mi_height);
+#endif
#if CONFIG_PVQ
od_encode_rollback(&x->daala_enc, rdo_buf);
#endif
}
-static void save_context(MACROBLOCK *const x, int mi_row, int mi_col,
- ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
- ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
- PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
+static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+ int mi_row, int mi_col,
#if CONFIG_PVQ
od_rollback_buffer *rdo_buf,
#endif
BLOCK_SIZE bsize) {
- const MACROBLOCKD *const xd = &x->e_mbd;
+ const MACROBLOCKD *xd = &x->e_mbd;
int p;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
@@ -1493,20 +2132,28 @@
// buffer the above/left context information of the block in search.
for (p = 0; p < MAX_MB_PLANE; ++p) {
- memcpy(a + num_4x4_blocks_wide * p,
+ memcpy(ctx->a + num_4x4_blocks_wide * p,
xd->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
xd->plane[p].subsampling_x);
- memcpy(l + num_4x4_blocks_high * p,
+ memcpy(ctx->l + num_4x4_blocks_high * p,
xd->left_context[p] +
((mi_row & MAX_MIB_MASK) * 2 >> xd->plane[p].subsampling_y),
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
xd->plane[p].subsampling_y);
}
- memcpy(sa, xd->above_seg_context + mi_col,
+ memcpy(ctx->sa, xd->above_seg_context + mi_col,
sizeof(*xd->above_seg_context) * mi_width);
- memcpy(sl, xd->left_seg_context + (mi_row & MAX_MIB_MASK),
+ memcpy(ctx->sl, xd->left_seg_context + (mi_row & MAX_MIB_MASK),
sizeof(xd->left_seg_context[0]) * mi_height);
+#if CONFIG_VAR_TX
+ memcpy(ctx->ta, xd->above_txfm_context,
+ sizeof(*xd->above_txfm_context) * mi_width);
+ memcpy(ctx->tl, xd->left_txfm_context,
+ sizeof(*xd->left_txfm_context) * mi_height);
+ ctx->p_ta = xd->above_txfm_context;
+ ctx->p_tl = xd->left_txfm_context;
+#endif
#if CONFIG_PVQ
od_encode_checkpoint(&x->daala_enc, rdo_buf);
#endif
@@ -1514,86 +2161,241 @@
static void encode_b(const AV1_COMP *const cpi, const TileInfo *const tile,
ThreadData *td, TOKENEXTRA **tp, int mi_row, int mi_col,
- int output_enabled, BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx) {
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition,
+#endif
+ PICK_MODE_CONTEXT *ctx, int *rate) {
MACROBLOCK *const x = &td->mb;
set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
- update_state(cpi, td, ctx, mi_row, mi_col, bsize);
- encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
+#if CONFIG_EXT_PARTITION_TYPES
+ x->e_mbd.mi[0]->mbmi.partition = partition;
+#endif
+ update_state(cpi, td, ctx, mi_row, mi_col, bsize, dry_run);
+ encode_superblock(cpi, td, tp, dry_run, mi_row, mi_col, bsize, ctx, rate);
+
+ if (!dry_run) {
+#if CONFIG_SUPERTX
+ update_stats(&cpi->common, td, mi_row, mi_col, 0);
+#else
+ update_stats(&cpi->common, td, mi_row, mi_col);
+#endif
+ }
}
static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
const TileInfo *const tile, TOKENEXTRA **tp, int mi_row,
- int mi_col, int output_enabled, BLOCK_SIZE bsize,
- PC_TREE *pc_tree) {
+ int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ PC_TREE *pc_tree, int *rate) {
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
- int ctx;
- PARTITION_TYPE partition;
- BLOCK_SIZE subsize = bsize;
+ const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+ const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+ const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+
+ assert(bsize >= BLOCK_8X8);
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
- if (bsize >= BLOCK_8X8) {
- ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
- subsize = get_subsize(bsize, pc_tree->partitioning);
- } else {
- ctx = 0;
- subsize = BLOCK_4X4;
- }
+ if (!dry_run) td->counts->partition[ctx][partition]++;
- partition = partition_lookup[bsl][subsize];
- if (output_enabled && bsize != BLOCK_4X4)
- td->counts->partition[ctx][partition]++;
+#if CONFIG_SUPERTX
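+  // SUPERTX: when the partition tree below this block allows it, predict the
+  // sub-blocks individually but code the residual with a single transform
+  // covering the whole block.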
+ if (!frame_is_intra_only(cm) && bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+ partition != PARTITION_NONE && !xd->lossless[0]) {
+ int supertx_enabled;
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ supertx_enabled = check_supertx_sb(bsize, supertx_size, pc_tree);
+ if (supertx_enabled) {
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ int x_idx, y_idx, i;
+ uint8_t *dst_buf[3];
+ int dst_stride[3];
+ set_skip_context(xd, mi_row, mi_col);
+ set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+ update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, dry_run,
+ pc_tree);
+
+ av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ dst_buf[i] = xd->plane[i].dst.buf;
+ dst_stride[i] = xd->plane[i].dst.stride;
+ }
+ predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, dry_run,
+ bsize, bsize, dst_buf, dst_stride, pc_tree);
+
+ set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+ set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize);
+
+ if (!x->skip) {
+ int this_rate = 0;
+ x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
+
+ av1_encode_sb_supertx((AV1_COMMON *)cm, x, bsize);
+ av1_tokenize_sb_supertx(cpi, td, tp, dry_run, bsize, rate);
+ if (rate) *rate += this_rate;
+ } else {
+ xd->mi[0]->mbmi.skip = 1;
+ if (!dry_run) td->counts->skip[av1_get_skip_context(xd)][1]++;
+ reset_skip_context(xd, bsize);
+ }
+ if (!dry_run) {
+ for (y_idx = 0; y_idx < mi_height; y_idx++)
+ for (x_idx = 0; x_idx < mi_width; x_idx++) {
+ if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width >
+ x_idx &&
+ (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height >
+ y_idx) {
+ xd->mi[x_idx + y_idx * cm->mi_stride]->mbmi.skip =
+ xd->mi[0]->mbmi.skip;
+ }
+ }
+ td->counts->supertx[partition_supertx_context_lookup[partition]]
+ [supertx_size][1]++;
+ td->counts->supertx_size[supertx_size]++;
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(supertx_size, bsize, 1) > 1 &&
+ !xd->mi[0]->mbmi.skip) {
+ int eset = get_ext_tx_set(supertx_size, bsize, 1);
+ if (eset > 0) {
+ ++td->counts->inter_ext_tx[eset][supertx_size]
+ [xd->mi[0]->mbmi.tx_type];
+ }
+ }
+#else
+ if (supertx_size < TX_32X32 && !xd->mi[0]->mbmi.skip) {
+ ++td->counts->inter_ext_tx[supertx_size][xd->mi[0]->mbmi.tx_type];
+ }
+#endif // CONFIG_EXT_TX
+ }
+#if CONFIG_EXT_PARTITION_TYPES
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize,
+ partition);
+#else
+ if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif
+#if CONFIG_VAR_TX
+ set_txfm_ctxs(supertx_size, mi_width, mi_height, xd);
+#endif // CONFIG_VAR_TX
+ return;
+ } else {
+ if (!dry_run) {
+ td->counts->supertx[partition_supertx_context_lookup[partition]]
+ [supertx_size][0]++;
+ }
+ }
+ }
+#endif // CONFIG_SUPERTX
switch (partition) {
case PARTITION_NONE:
- encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
- &pc_tree->none);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ &pc_tree->none, rate);
break;
case PARTITION_VERT:
- encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
- &pc_tree->vertical[0]);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ &pc_tree->vertical[0], rate);
if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
- encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled,
- subsize, &pc_tree->vertical[1]);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ &pc_tree->vertical[1], rate);
}
break;
case PARTITION_HORZ:
- encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
- &pc_tree->horizontal[0]);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ &pc_tree->horizontal[0], rate);
if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
- encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled,
- subsize, &pc_tree->horizontal[1]);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ &pc_tree->horizontal[1], rate);
}
break;
case PARTITION_SPLIT:
if (bsize == BLOCK_8X8) {
- encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
- pc_tree->leaf_split[0]);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ pc_tree->leaf_split[0], rate);
} else {
- encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
- pc_tree->split[0]);
- encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
- subsize, pc_tree->split[1]);
- encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
- subsize, pc_tree->split[2]);
- encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
- subsize, pc_tree->split[3]);
+ encode_sb(cpi, td, tile, tp, mi_row, mi_col, dry_run, subsize,
+ pc_tree->split[0], rate);
+ encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ pc_tree->split[1], rate);
+ encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ pc_tree->split[2], rate);
+ encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, dry_run,
+ subsize, pc_tree->split[3], rate);
}
break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition,
+ &pc_tree->horizontala[0], rate);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->horizontala[1], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ partition, &pc_tree->horizontala[2], rate);
+ break;
+ case PARTITION_HORZ_B:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
+ &pc_tree->horizontalb[0], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+ partition, &pc_tree->horizontalb[1], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->horizontalb[2], rate);
+ break;
+ case PARTITION_VERT_A:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, bsize2, partition,
+ &pc_tree->verticala[0], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+ partition, &pc_tree->verticala[1], rate);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ partition, &pc_tree->verticala[2], rate);
+
+ break;
+ case PARTITION_VERT_B:
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, dry_run, subsize, partition,
+ &pc_tree->verticalb[0], rate);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->verticalb[1], rate);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->verticalb[2], rate);
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
default: assert(0 && "Invalid partition type."); break;
}
+#if CONFIG_EXT_PARTITION_TYPES
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+#else
if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif // CONFIG_EXT_PARTITION_TYPES
}
// Check to see if the given partition size is allowed for a specified number
-// of 8x8 block rows and columns remaining in the image.
+// of mi block rows and columns remaining in the image.
// If not then return the largest allowed partition size
static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left,
int cols_left, int *bh, int *bw) {
@@ -1611,87 +2413,92 @@
return bsize;
}
-static void set_partial_b64x64_partition(MODE_INFO *mi, int mis, int bh_in,
- int bw_in, int row8x8_remaining,
- int col8x8_remaining, BLOCK_SIZE bsize,
- MODE_INFO **mi_8x8) {
+static void set_partial_sb_partition(const AV1_COMMON *const cm, MODE_INFO *mi,
+ int bh_in, int bw_in,
+ int mi_rows_remaining,
+ int mi_cols_remaining, BLOCK_SIZE bsize,
+ MODE_INFO **mib) {
int bh = bh_in;
int r, c;
- for (r = 0; r < MAX_MIB_SIZE; r += bh) {
+ for (r = 0; r < cm->mib_size; r += bh) {
int bw = bw_in;
- for (c = 0; c < MAX_MIB_SIZE; c += bw) {
- const int index = r * mis + c;
- mi_8x8[index] = mi + index;
- mi_8x8[index]->mbmi.sb_type = find_partition_size(
- bsize, row8x8_remaining - r, col8x8_remaining - c, &bh, &bw);
+ for (c = 0; c < cm->mib_size; c += bw) {
+ const int index = r * cm->mi_stride + c;
+ mib[index] = mi + index;
+ mib[index]->mbmi.sb_type = find_partition_size(
+ bsize, mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw);
}
}
}
-// This function attempts to set all mode info entries in a given SB64
+// This function attempts to set all mode info entries in a given superblock
// to the same block partition size.
// However, at the bottom and right borders of the image the requested size
// may not be allowed in which case this code attempts to choose the largest
// allowable partition.
static void set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
- MODE_INFO **mi_8x8, int mi_row, int mi_col,
+ MODE_INFO **mib, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
AV1_COMMON *const cm = &cpi->common;
- const int mis = cm->mi_stride;
- const int row8x8_remaining = tile->mi_row_end - mi_row;
- const int col8x8_remaining = tile->mi_col_end - mi_col;
+ const int mi_rows_remaining = tile->mi_row_end - mi_row;
+ const int mi_cols_remaining = tile->mi_col_end - mi_col;
int block_row, block_col;
- MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col;
+ MODE_INFO *const mi_upper_left = cm->mi + mi_row * cm->mi_stride + mi_col;
int bh = num_8x8_blocks_high_lookup[bsize];
int bw = num_8x8_blocks_wide_lookup[bsize];
- assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
+ assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0));
- // Apply the requested partition size to the SB64 if it is all "in image"
- if ((col8x8_remaining >= MAX_MIB_SIZE) &&
- (row8x8_remaining >= MAX_MIB_SIZE)) {
- for (block_row = 0; block_row < MAX_MIB_SIZE; block_row += bh) {
- for (block_col = 0; block_col < MAX_MIB_SIZE; block_col += bw) {
- int index = block_row * mis + block_col;
- mi_8x8[index] = mi_upper_left + index;
- mi_8x8[index]->mbmi.sb_type = bsize;
+ // Apply the requested partition size to the SB if it is all "in image"
+ if ((mi_cols_remaining >= cm->mib_size) &&
+ (mi_rows_remaining >= cm->mib_size)) {
+ for (block_row = 0; block_row < cm->mib_size; block_row += bh) {
+ for (block_col = 0; block_col < cm->mib_size; block_col += bw) {
+ int index = block_row * cm->mi_stride + block_col;
+ mib[index] = mi_upper_left + index;
+ mib[index]->mbmi.sb_type = bsize;
}
}
} else {
- // Else this is a partial SB64.
- set_partial_b64x64_partition(mi_upper_left, mis, bh, bw, row8x8_remaining,
- col8x8_remaining, bsize, mi_8x8);
+ // Else this is a partial SB.
+ set_partial_sb_partition(cm, mi_upper_left, bh, bw, mi_rows_remaining,
+ mi_cols_remaining, bsize, mib);
}
}
static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
- TileDataEnc *tile_data, MODE_INFO **mi_8x8,
+ TileDataEnc *tile_data, MODE_INFO **mib,
TOKENEXTRA **tp, int mi_row, int mi_col,
BLOCK_SIZE bsize, int *rate, int64_t *dist,
+#if CONFIG_SUPERTX
+ int *rate_nocoef,
+#endif
int do_recon, PC_TREE *pc_tree) {
AV1_COMMON *const cm = &cpi->common;
TileInfo *const tile_info = &tile_data->tile_info;
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- const int mis = cm->mi_stride;
- const int bsl = b_width_log2_lookup[bsize];
- const int mi_step = num_4x4_blocks_wide_lookup[bsize] / 2;
- const int bss = (1 << bsl) / 4;
- int i, pl;
- PARTITION_TYPE partition = PARTITION_NONE;
- BLOCK_SIZE subsize;
- ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
- PARTITION_CONTEXT sl[8], sa[8];
+ const int bs = num_8x8_blocks_wide_lookup[bsize];
+ const int hbs = bs / 2;
+ int i;
+ const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
RD_COST last_part_rdc, none_rdc, chosen_rdc;
BLOCK_SIZE sub_subsize = BLOCK_4X4;
int splits_below = 0;
- BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
+ BLOCK_SIZE bs_type = mib[0]->mbmi.sb_type;
int do_partition_search = 1;
PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+#if CONFIG_SUPERTX
+ int last_part_rate_nocoef = INT_MAX;
+ int none_rate_nocoef = INT_MAX;
+ int chosen_rate_nocoef = INT_MAX;
+#endif
#if CONFIG_PVQ
od_rollback_buffer pre_rdo_buf;
#endif
-
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
assert(num_4x4_blocks_wide_lookup[bsize] ==
@@ -1701,17 +2508,20 @@
av1_rd_cost_reset(&none_rdc);
av1_rd_cost_reset(&chosen_rdc);
- partition = partition_lookup[bsl][bs_type];
- subsize = get_subsize(bsize, partition);
-
pc_tree->partitioning = partition;
+
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+#endif
#if !CONFIG_PVQ
- save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ save_context(x, &x_ctx, mi_row, mi_col, bsize);
#else
- save_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
+ save_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
#endif
- if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) {
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
x->mb_energy = av1_block_energy(cpi, x, bsize);
}
@@ -1725,7 +2535,7 @@
splits_below = 1;
for (i = 0; i < 4; i++) {
int jj = i >> 1, ii = i & 0x01;
- MODE_INFO *this_mi = mi_8x8[jj * bss * mis + ii * bss];
+ MODE_INFO *this_mi = mib[jj * hbs * cm->mi_stride + ii * hbs];
if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) {
splits_below = 0;
}
@@ -1735,142 +2545,230 @@
// If partition is not none try none unless each of the 4 splits are split
// even further..
if (partition != PARTITION_NONE && !splits_below &&
- mi_row + (mi_step >> 1) < cm->mi_rows &&
- mi_col + (mi_step >> 1) < cm->mi_cols) {
+ mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
pc_tree->partitioning = PARTITION_NONE;
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
- ctx_none, INT64_MAX);
-
- pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
+#if CONFIG_SUPERTX
+ &none_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_NONE,
+#endif
+ bsize, ctx_none, INT64_MAX);
if (none_rdc.rate < INT_MAX) {
none_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
none_rdc.rdcost =
RDCOST(x->rdmult, x->rddiv, none_rdc.rate, none_rdc.dist);
+#if CONFIG_SUPERTX
+ none_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE];
+#endif
}
#if !CONFIG_PVQ
- restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
#else
- restore_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
#endif
- mi_8x8[0]->mbmi.sb_type = bs_type;
+ mib[0]->mbmi.sb_type = bs_type;
pc_tree->partitioning = partition;
}
}
switch (partition) {
case PARTITION_NONE:
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, bsize,
- ctx_none, INT64_MAX);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+ &last_part_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_NONE,
+#endif
+ bsize, ctx_none, INT64_MAX);
break;
case PARTITION_HORZ:
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+ &last_part_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_HORZ,
+#endif
subsize, &pc_tree->horizontal[0], INT64_MAX);
if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
- mi_row + (mi_step >> 1) < cm->mi_rows) {
+ mi_row + hbs < cm->mi_rows) {
RD_COST tmp_rdc;
+#if CONFIG_SUPERTX
+ int rt_nocoef = 0;
+#endif
PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0];
av1_rd_cost_init(&tmp_rdc);
- update_state(cpi, td, ctx_h, mi_row, mi_col, subsize);
- encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx_h);
- rd_pick_sb_modes(cpi, tile_data, x, mi_row + (mi_step >> 1), mi_col,
- &tmp_rdc, subsize, &pc_tree->horizontal[1], INT64_MAX);
+ update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+ ctx_h, NULL);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
+#if CONFIG_SUPERTX
+ &rt_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_HORZ,
+#endif
+ subsize, &pc_tree->horizontal[1], INT64_MAX);
if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
av1_rd_cost_reset(&last_part_rdc);
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef = INT_MAX;
+#endif
break;
}
last_part_rdc.rate += tmp_rdc.rate;
last_part_rdc.dist += tmp_rdc.dist;
last_part_rdc.rdcost += tmp_rdc.rdcost;
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef += rt_nocoef;
+#endif
}
break;
case PARTITION_VERT:
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+ &last_part_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_VERT,
+#endif
subsize, &pc_tree->vertical[0], INT64_MAX);
if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
- mi_col + (mi_step >> 1) < cm->mi_cols) {
+ mi_col + hbs < cm->mi_cols) {
RD_COST tmp_rdc;
+#if CONFIG_SUPERTX
+ int rt_nocoef = 0;
+#endif
PICK_MODE_CONTEXT *ctx_v = &pc_tree->vertical[0];
av1_rd_cost_init(&tmp_rdc);
- update_state(cpi, td, ctx_v, mi_row, mi_col, subsize);
- encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx_v);
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1),
- &tmp_rdc, subsize,
- &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX);
+ update_state(cpi, td, ctx_v, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+ ctx_v, NULL);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
+#if CONFIG_SUPERTX
+ &rt_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_VERT,
+#endif
+ subsize, &pc_tree->vertical[bsize > BLOCK_8X8],
+ INT64_MAX);
if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
av1_rd_cost_reset(&last_part_rdc);
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef = INT_MAX;
+#endif
break;
}
last_part_rdc.rate += tmp_rdc.rate;
last_part_rdc.dist += tmp_rdc.dist;
last_part_rdc.rdcost += tmp_rdc.rdcost;
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef += rt_nocoef;
+#endif
}
break;
case PARTITION_SPLIT:
if (bsize == BLOCK_8X8) {
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+ &last_part_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_SPLIT,
+#endif
subsize, pc_tree->leaf_split[0], INT64_MAX);
break;
}
last_part_rdc.rate = 0;
last_part_rdc.dist = 0;
last_part_rdc.rdcost = 0;
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef = 0;
+#endif
for (i = 0; i < 4; i++) {
- int x_idx = (i & 1) * (mi_step >> 1);
- int y_idx = (i >> 1) * (mi_step >> 1);
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
int jj = i >> 1, ii = i & 0x01;
RD_COST tmp_rdc;
+#if CONFIG_SUPERTX
+ int rt_nocoef;
+#endif
if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
continue;
av1_rd_cost_init(&tmp_rdc);
- rd_use_partition(cpi, td, tile_data, mi_8x8 + jj * bss * mis + ii * bss,
- tp, mi_row + y_idx, mi_col + x_idx, subsize,
- &tmp_rdc.rate, &tmp_rdc.dist, i != 3,
- pc_tree->split[i]);
+ rd_use_partition(cpi, td, tile_data,
+ mib + jj * hbs * cm->mi_stride + ii * hbs, tp,
+ mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
+ &tmp_rdc.dist,
+#if CONFIG_SUPERTX
+ &rt_nocoef,
+#endif
+ i != 3, pc_tree->split[i]);
if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
av1_rd_cost_reset(&last_part_rdc);
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef = INT_MAX;
+#endif
break;
}
last_part_rdc.rate += tmp_rdc.rate;
last_part_rdc.dist += tmp_rdc.dist;
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef += rt_nocoef;
+#endif
}
break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_VERT_A:
+ case PARTITION_VERT_B:
+ case PARTITION_HORZ_A:
+      case PARTITION_HORZ_B: assert(0 && "Cannot handle extended partition types");
+#endif // CONFIG_EXT_PARTITION_TYPES
default: assert(0); break;
}
- pl = partition_plane_context(xd, mi_row, mi_col, bsize);
if (last_part_rdc.rate < INT_MAX) {
last_part_rdc.rate += cpi->partition_cost[pl][partition];
last_part_rdc.rdcost =
RDCOST(x->rdmult, x->rddiv, last_part_rdc.rate, last_part_rdc.dist);
+#if CONFIG_SUPERTX
+ last_part_rate_nocoef += cpi->partition_cost[pl][partition];
+#endif
}
if (do_partition_search && cpi->sf.adjust_partitioning_from_last_frame &&
cpi->sf.partition_search_type == SEARCH_PARTITION &&
partition != PARTITION_SPLIT && bsize > BLOCK_8X8 &&
- (mi_row + mi_step < cm->mi_rows ||
- mi_row + (mi_step >> 1) == cm->mi_rows) &&
- (mi_col + mi_step < cm->mi_cols ||
- mi_col + (mi_step >> 1) == cm->mi_cols)) {
+ (mi_row + bs < cm->mi_rows || mi_row + hbs == cm->mi_rows) &&
+ (mi_col + bs < cm->mi_cols || mi_col + hbs == cm->mi_cols)) {
BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
chosen_rdc.rate = 0;
chosen_rdc.dist = 0;
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef = 0;
+#endif
#if !CONFIG_PVQ
- restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
#else
- restore_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
#endif
pc_tree->partitioning = PARTITION_SPLIT;
// Split partition.
for (i = 0; i < 4; i++) {
- int x_idx = (i & 1) * (mi_step >> 1);
- int y_idx = (i >> 1) * (mi_step >> 1);
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
RD_COST tmp_rdc;
- ENTROPY_CONTEXT l2[16 * MAX_MB_PLANE], a2[16 * MAX_MB_PLANE];
- PARTITION_CONTEXT sl2[8], sa2[8];
+#if CONFIG_SUPERTX
+ int rt_nocoef = 0;
+#endif
#if CONFIG_PVQ
od_rollback_buffer buf;
#endif
@@ -1878,111 +2776,165 @@
continue;
#if !CONFIG_PVQ
- save_context(x, mi_row, mi_col, a2, l2, sa2, sl2, bsize);
+ save_context(x, &x_ctx, mi_row, mi_col, bsize);
#else
- save_context(x, mi_row, mi_col, a2, l2, sa2, sl2, &buf, bsize);
+ save_context(x, &x_ctx, mi_row, mi_col, &buf, bsize);
#endif
pc_tree->split[i]->partitioning = PARTITION_NONE;
rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
- &tmp_rdc, split_subsize, &pc_tree->split[i]->none,
- INT64_MAX);
+ &tmp_rdc,
+#if CONFIG_SUPERTX
+ &rt_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_SPLIT,
+#endif
+ split_subsize, &pc_tree->split[i]->none, INT64_MAX);
#if !CONFIG_PVQ
- restore_context(x, mi_row, mi_col, a2, l2, sa2, sl2, bsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
#else
- restore_context(x, mi_row, mi_col, a2, l2, sa2, sl2, &buf, bsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, &buf, bsize);
#endif
if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
av1_rd_cost_reset(&chosen_rdc);
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef = INT_MAX;
+#endif
break;
}
chosen_rdc.rate += tmp_rdc.rate;
chosen_rdc.dist += tmp_rdc.dist;
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef += rt_nocoef;
+#endif
if (i != 3)
- encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0,
- split_subsize, pc_tree->split[i]);
+ encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx,
+ OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);
- pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
- split_subsize);
chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef += cpi->partition_cost[pl][PARTITION_SPLIT];
+#endif
}
- pl = partition_plane_context(xd, mi_row, mi_col, bsize);
if (chosen_rdc.rate < INT_MAX) {
chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
chosen_rdc.rdcost =
RDCOST(x->rdmult, x->rddiv, chosen_rdc.rate, chosen_rdc.dist);
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE];
+#endif
}
}
// If last_part is better set the partitioning to that.
if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
- mi_8x8[0]->mbmi.sb_type = bsize;
+ mib[0]->mbmi.sb_type = bsize;
if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition;
chosen_rdc = last_part_rdc;
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef = last_part_rate_nocoef;
+#endif
}
// If none was better set the partitioning to that.
if (none_rdc.rdcost < chosen_rdc.rdcost) {
if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
chosen_rdc = none_rdc;
+#if CONFIG_SUPERTX
+ chosen_rate_nocoef = none_rate_nocoef;
+#endif
}
#if !CONFIG_PVQ
- restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
#else
- restore_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
#endif
// We must have chosen a partitioning and encoding or we'll fail later on.
// No other opportunities for success.
- if (bsize == BLOCK_64X64)
+ if (bsize == cm->sb_size)
assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
if (do_recon) {
- int output_enabled = (bsize == BLOCK_64X64);
- encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
- pc_tree);
+ if (bsize == cm->sb_size) {
+ // NOTE: To get estimate for rate due to the tokens, use:
+ // int rate_coeffs = 0;
+ // encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+ // bsize, pc_tree, &rate_coeffs);
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
}
*rate = chosen_rdc.rate;
*dist = chosen_rdc.dist;
+#if CONFIG_SUPERTX
+ *rate_nocoef = chosen_rate_nocoef;
+#endif
}
+/* clang-format off */
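+// Tables used to relax the min/max partition sizes observed in neighboring
+// blocks when setting the automatic partition search range (indexed by
+// BLOCK_SIZE).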
static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
- BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4,
- BLOCK_4X4, BLOCK_8X8, BLOCK_8X8, BLOCK_8X8, BLOCK_16X16,
- BLOCK_16X16, BLOCK_16X16, BLOCK_16X16
+ BLOCK_4X4, // 4x4
+ BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, // 4x8, 8x4, 8x8
+ BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 8x16, 16x8, 16x16
+ BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 16x32, 32x16, 32x32
+ BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 32x64, 64x32, 64x64
+#if CONFIG_EXT_PARTITION
+ BLOCK_16X16, BLOCK_16X16, BLOCK_16X16 // 64x128, 128x64, 128x128
+#endif // CONFIG_EXT_PARTITION
};
static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
- BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_32X32,
- BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64,
- BLOCK_64X64, BLOCK_64X64, BLOCK_64X64
+ BLOCK_8X8, // 4x4
+ BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 4x8, 8x4, 8x8
+ BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, // 8x16, 16x8, 16x16
+ BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, // 16x32, 32x16, 32x32
+ BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 32x64, 64x32, 64x64
+#if CONFIG_EXT_PARTITION
+ BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST // 64x128, 128x64, 128x128
+#endif // CONFIG_EXT_PARTITION
};
+// Next square block size less than or equal to the current block size.
+static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = {
+ BLOCK_4X4, // 4x4
+ BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x8, 8x4, 8x8
+ BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 8x16, 16x8, 16x16
+ BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 16x32, 32x16, 32x32
+ BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, // 32x64, 64x32, 64x64
+#if CONFIG_EXT_PARTITION
+ BLOCK_64X64, BLOCK_64X64, BLOCK_128X128 // 64x128, 128x64, 128x128
+#endif // CONFIG_EXT_PARTITION
+};
+/* clang-format on */
+
// Look at all the mode_info entries for blocks that are part of this
// partition and find the min and max values for sb_type.
-// At the moment this is designed to work on a 64x64 SB but could be
+// At the moment this is designed to work on a superblock but could be
// adjusted to use a size parameter.
//
// The min and max are assumed to have been initialized prior to calling this
-// function so repeat calls can accumulate a min and max of more than one sb64.
-static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8,
+// function so repeat calls can accumulate a min and max of more than one
+// superblock.
+static void get_sb_partition_size_range(const AV1_COMMON *const cm,
+ MACROBLOCKD *xd, MODE_INFO **mib,
BLOCK_SIZE *min_block_size,
- BLOCK_SIZE *max_block_size,
- int bs_hist[BLOCK_SIZES]) {
- int sb_width_in_blocks = MAX_MIB_SIZE;
- int sb_height_in_blocks = MAX_MIB_SIZE;
+ BLOCK_SIZE *max_block_size) {
int i, j;
int index = 0;
// Check the sb_type for each block that belongs to this region.
- for (i = 0; i < sb_height_in_blocks; ++i) {
- for (j = 0; j < sb_width_in_blocks; ++j) {
- MODE_INFO *mi = mi_8x8[index + j];
- BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : 0;
- bs_hist[sb_type]++;
+ for (i = 0; i < cm->mib_size; ++i) {
+ for (j = 0; j < cm->mib_size; ++j) {
+ MODE_INFO *mi = mib[index + j];
+ BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : BLOCK_4X4;
*min_block_size = AOMMIN(*min_block_size, sb_type);
*max_block_size = AOMMAX(*max_block_size, sb_type);
}
@@ -1990,13 +2942,6 @@
}
}
-// Next square block size less or equal than current block size.
-static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = {
- BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8,
- BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_32X32,
- BLOCK_32X32, BLOCK_32X32, BLOCK_64X64
-};
-
// Look at neighboring blocks and set a min and max partition size based on
// what they chose.
static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile,
@@ -2007,17 +2952,16 @@
MODE_INFO **mi = xd->mi;
const int left_in_image = xd->left_available && mi[-1];
const int above_in_image = xd->up_available && mi[-xd->mi_stride];
- const int row8x8_remaining = tile->mi_row_end - mi_row;
- const int col8x8_remaining = tile->mi_col_end - mi_col;
+ const int mi_rows_remaining = tile->mi_row_end - mi_row;
+ const int mi_cols_remaining = tile->mi_col_end - mi_col;
int bh, bw;
BLOCK_SIZE min_size = BLOCK_4X4;
- BLOCK_SIZE max_size = BLOCK_64X64;
- int bs_hist[BLOCK_SIZES] = { 0 };
+ BLOCK_SIZE max_size = BLOCK_LARGEST;
// Trap case where we do not have a prediction.
if (left_in_image || above_in_image || cm->frame_type != KEY_FRAME) {
// Default "min to max" and "max to min"
- min_size = BLOCK_64X64;
+ min_size = BLOCK_LARGEST;
max_size = BLOCK_4X4;
// NOTE: each call to get_sb_partition_size_range() uses the previous
@@ -2026,19 +2970,17 @@
if (cm->frame_type != KEY_FRAME) {
MODE_INFO **prev_mi =
&cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
- get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size, bs_hist);
+ get_sb_partition_size_range(cm, xd, prev_mi, &min_size, &max_size);
}
- // Find the min and max partition sizes used in the left SB64
+ // Find the min and max partition sizes used in the left superblock
if (left_in_image) {
- MODE_INFO **left_sb64_mi = &mi[-MAX_MIB_SIZE];
- get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size,
- bs_hist);
+ MODE_INFO **left_sb_mi = &mi[-cm->mib_size];
+ get_sb_partition_size_range(cm, xd, left_sb_mi, &min_size, &max_size);
}
- // Find the min and max partition sizes used in the above SB64.
+    // Find the min and max partition sizes used in the above superblock.
if (above_in_image) {
- MODE_INFO **above_sb64_mi = &mi[-xd->mi_stride * MAX_MIB_SIZE];
- get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size,
- bs_hist);
+ MODE_INFO **above_sb_mi = &mi[-xd->mi_stride * cm->mib_size];
+ get_sb_partition_size_range(cm, xd, above_sb_mi, &min_size, &max_size);
}
// Adjust observed min and max for "relaxed" auto partition case.
@@ -2049,28 +2991,28 @@
}
// Check border cases where max and min from neighbors may not be legal.
- max_size = find_partition_size(max_size, row8x8_remaining, col8x8_remaining,
+ max_size = find_partition_size(max_size, mi_rows_remaining, mi_cols_remaining,
&bh, &bw);
+ min_size = AOMMIN(min_size, max_size);
+
// Test for blocks at the edge of the active image.
// This may be the actual edge of the image or where there are formatting
// bars.
if (av1_active_edge_sb(cpi, mi_row, mi_col)) {
min_size = BLOCK_4X4;
} else {
- min_size =
- AOMMIN(cpi->sf.rd_auto_partition_min_limit, AOMMIN(min_size, max_size));
+ min_size = AOMMIN(cpi->sf.rd_auto_partition_min_limit, min_size);
}
// When use_square_partition_only is true, make sure at least one square
// partition is allowed by selecting the next smaller square size as
// *min_block_size.
- if (cpi->sf.use_square_partition_only &&
- next_square_size[max_size] < min_size) {
- min_size = next_square_size[max_size];
+ if (cpi->sf.use_square_partition_only) {
+ min_size = AOMMIN(min_size, next_square_size[max_size]);
}
- *min_block_size = min_size;
- *max_block_size = max_size;
+ *min_block_size = AOMMIN(min_size, cm->sb_size);
+ *max_block_size = AOMMIN(max_size, cm->sb_size);
}
// TODO(jingning) refactor functions setting partition search range
@@ -2122,8 +3064,8 @@
max_size = max_partition_size[max_size];
}
- *min_bs = min_size;
- *max_bs = max_size;
+ *min_bs = AOMMIN(min_size, cm->sb_size);
+ *max_bs = AOMMIN(max_size, cm->sb_size);
}
static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
@@ -2135,18 +3077,68 @@
}
#if CONFIG_FP_MB_STATS
-const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1,
- 1, 2, 2, 2, 4, 4 };
-const int num_16x16_blocks_high_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1,
- 2, 1, 2, 4, 2, 4 };
const int qindex_skip_threshold_lookup[BLOCK_SIZES] = {
- 0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120
+ 0,
+ 10,
+ 10,
+ 30,
+ 40,
+ 40,
+ 60,
+ 80,
+ 80,
+ 90,
+ 100,
+ 100,
+ 120,
+#if CONFIG_EXT_PARTITION
+ // TODO(debargha): What are the correct numbers here?
+ 130,
+ 130,
+ 150
+#endif // CONFIG_EXT_PARTITION
};
const int qindex_split_threshold_lookup[BLOCK_SIZES] = {
- 0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120
+ 0,
+ 3,
+ 3,
+ 7,
+ 15,
+ 15,
+ 30,
+ 40,
+ 40,
+ 60,
+ 80,
+ 80,
+ 120,
+#if CONFIG_EXT_PARTITION
+ // TODO(debargha): What are the correct numbers here?
+ 160,
+ 160,
+ 240
+#endif // CONFIG_EXT_PARTITION
};
const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = {
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 1,
+ 4,
+ 4,
+  6,
+#if CONFIG_EXT_PARTITION
+ // TODO(debargha): What are the correct numbers here?
+ 8,
+ 8,
+ 10
+#endif // CONFIG_EXT_PARTITION
};
typedef enum {
@@ -2182,28 +3174,222 @@
}
#endif
+#if CONFIG_EXT_PARTITION_TYPES
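+// Evaluate one of the 3-way extended partitions (HORZ_A/B, VERT_A/B): pick
+// modes for the three sub-blocks at (mi_row0/1/2, mi_col0/1/2) with sizes
+// subsize0/1/2, and update *best_rdc and pc_tree->partitioning if the total
+// rd cost beats the current best.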
+static void rd_test_partition3(
+ const AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data,
+ TOKENEXTRA **tp, PC_TREE *pc_tree, RD_COST *best_rdc,
+ PICK_MODE_CONTEXT ctxs[3], PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, PARTITION_TYPE partition,
+#if CONFIG_SUPERTX
+ int64_t best_rd, int *best_rate_nocoef, RD_SEARCH_MACROBLOCK_CONTEXT *x_ctx,
+#endif
+ int mi_row0, int mi_col0, BLOCK_SIZE subsize0, int mi_row1, int mi_col1,
+ BLOCK_SIZE subsize1, int mi_row2, int mi_col2, BLOCK_SIZE subsize2) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_COST this_rdc, sum_rdc;
+#if CONFIG_SUPERTX
+ const AV1_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ int this_rate_nocoef, sum_rate_nocoef;
+ int abort_flag;
+ const int supertx_allowed = !frame_is_intra_only(cm) &&
+ bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+ !xd->lossless[0];
+#endif
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx);
+
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row0, mi_col0, &sum_rdc,
+#if CONFIG_SUPERTX
+ &sum_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ subsize0, &ctxs[0], best_rdc->rdcost);
+#if CONFIG_SUPERTX
+ abort_flag = sum_rdc.rdcost >= best_rd;
+#endif
+
+#if CONFIG_SUPERTX
+ if (sum_rdc.rdcost < INT64_MAX) {
+#else
+ if (sum_rdc.rdcost < best_rdc->rdcost) {
+#endif
+ PICK_MODE_CONTEXT *ctx_0 = &ctxs[0];
+ update_state(cpi, td, ctx_0, mi_row0, mi_col0, subsize0, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row0, mi_col0, subsize0,
+ ctx_0, NULL);
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_0);
+
+#if CONFIG_SUPERTX
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row1, mi_col1, &this_rdc,
+ &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ subsize1, &ctxs[1], INT64_MAX - sum_rdc.rdcost);
+#else
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row1, mi_col1, &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ subsize1, &ctxs[1], best_rdc->rdcost - sum_rdc.rdcost);
+#endif // CONFIG_SUPERTX
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef = INT_MAX;
+#endif
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += this_rate_nocoef;
+#endif
+ }
+
+#if CONFIG_SUPERTX
+ if (sum_rdc.rdcost < INT64_MAX) {
+#else
+ if (sum_rdc.rdcost < best_rdc->rdcost) {
+#endif
+ PICK_MODE_CONTEXT *ctx_1 = &ctxs[1];
+ update_state(cpi, td, ctx_1, mi_row1, mi_col1, subsize1, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row1, mi_col1, subsize1,
+ ctx_1, NULL);
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_1);
+
+#if CONFIG_SUPERTX
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row2, mi_col2, &this_rdc,
+ &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ subsize2, &ctxs[2], INT64_MAX - sum_rdc.rdcost);
+#else
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row2, mi_col2, &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+ partition,
+#endif
+ subsize2, &ctxs[2], best_rdc->rdcost - sum_rdc.rdcost);
+#endif // CONFIG_SUPERTX
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef = INT_MAX;
+#endif
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += this_rate_nocoef;
+#endif
+ }
+
+#if CONFIG_SUPERTX
+ if (supertx_allowed && !abort_flag && sum_rdc.rdcost < INT64_MAX) {
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ const PARTITION_TYPE best_partition = pc_tree->partitioning;
+ pc_tree->partitioning = partition;
+ sum_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
+ [supertx_size],
+ 0);
+ sum_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+ if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+ TX_TYPE best_tx = DCT_DCT;
+ RD_COST tmp_rdc = { sum_rate_nocoef, 0, 0 };
+
+ restore_context(x, x_ctx, mi_row, mi_col, bsize);
+
+ rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+ &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree);
+
+ tmp_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
+ [supertx_size],
+ 1);
+ tmp_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+ if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+ sum_rdc = tmp_rdc;
+ update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+ supertx_size, pc_tree);
+ }
+ }
+
+ pc_tree->partitioning = best_partition;
+ }
+#endif // CONFIG_SUPERTX
+
+ if (sum_rdc.rdcost < best_rdc->rdcost) {
+ int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ sum_rdc.rate += cpi->partition_cost[pl][partition];
+ sum_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += cpi->partition_cost[pl][partition];
+#endif
+ if (sum_rdc.rdcost < best_rdc->rdcost) {
+#if CONFIG_SUPERTX
+ *best_rate_nocoef = sum_rate_nocoef;
+ assert(*best_rate_nocoef >= 0);
+#endif
+ *best_rdc = sum_rdc;
+ pc_tree->partitioning = partition;
+ }
+ }
+ }
+ }
+}
+#endif // CONFIG_EXT_PARTITION_TYPES
+
// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
// unlikely to be selected depending on previous rate-distortion optimization
// results, for encoding speed-up.
static void rd_pick_partition(const AV1_COMP *const cpi, ThreadData *td,
TileDataEnc *tile_data, TOKENEXTRA **tp,
int mi_row, int mi_col, BLOCK_SIZE bsize,
- RD_COST *rd_cost, int64_t best_rd,
- PC_TREE *pc_tree) {
+ RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+ int *rate_nocoef,
+#endif
+ int64_t best_rd, PC_TREE *pc_tree) {
const AV1_COMMON *const cm = &cpi->common;
TileInfo *const tile_info = &tile_data->tile_info;
MACROBLOCK *const x = &td->mb;
- const MACROBLOCKD *const xd = &x->e_mbd;
+ MACROBLOCKD *const xd = &x->e_mbd;
const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2;
- ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
- PARTITION_CONTEXT sl[8], sa[8];
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
const TOKENEXTRA *const tp_orig = *tp;
PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+ const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ const int *partition_cost = cpi->partition_cost[pl];
+ int tmp_partition_cost[PARTITION_TYPES];
BLOCK_SIZE subsize;
RD_COST this_rdc, sum_rdc, best_rdc;
+#if CONFIG_SUPERTX
+ int this_rate_nocoef, sum_rate_nocoef = 0, best_rate_nocoef = INT_MAX;
+ int abort_flag;
+ const int supertx_allowed = !frame_is_intra_only(cm) &&
+ bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+ !xd->lossless[0];
+#endif // CONFIG_SUPERTX
const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
int do_square_split = bsize_at_least_8x8;
int do_rectangular_split = 1;
+#if CONFIG_EXT_PARTITION_TYPES
+ BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
// Override skipping rectangular partition operations for edge blocks
const int force_horz_split = (mi_row + mi_step >= cm->mi_rows);
@@ -2231,6 +3417,39 @@
(void)*tp_orig;
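+
+  // At frame edges some partition types are disallowed; give those an INT_MAX
+  // cost and derive the costs of the remaining choices from the partition
+  // probability model.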
+ if (force_horz_split || force_vert_split) {
+ tmp_partition_cost[PARTITION_NONE] = INT_MAX;
+
+ if (!force_vert_split) { // force_horz_split only
+ tmp_partition_cost[PARTITION_VERT] = INT_MAX;
+ tmp_partition_cost[PARTITION_HORZ] =
+ av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_HORZ], 0);
+ tmp_partition_cost[PARTITION_SPLIT] =
+ av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_HORZ], 1);
+ } else if (!force_horz_split) { // force_vert_split only
+ tmp_partition_cost[PARTITION_HORZ] = INT_MAX;
+ tmp_partition_cost[PARTITION_VERT] =
+ av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_VERT], 0);
+ tmp_partition_cost[PARTITION_SPLIT] =
+ av1_cost_bit(cm->fc->partition_prob[pl][PARTITION_VERT], 1);
+    } else {  // force_horz_split && force_vert_split
+ tmp_partition_cost[PARTITION_HORZ] = INT_MAX;
+ tmp_partition_cost[PARTITION_VERT] = INT_MAX;
+ tmp_partition_cost[PARTITION_SPLIT] = 0;
+ }
+
+ partition_cost = tmp_partition_cost;
+ }
+
+#if CONFIG_VAR_TX
+#ifndef NDEBUG
+ // Nothing should rely on the default value of this array (it is just
+ // leftover from encoding the previous block). Set it to a magic number
+ // when debugging.
+ memset(x->blk_skip[0], 234, sizeof(x->blk_skip[0]));
+#endif // NDEBUG
+#endif // CONFIG_VAR_TX
+
assert(num_8x8_blocks_wide_lookup[bsize] ==
num_8x8_blocks_high_lookup[bsize]);
@@ -2241,7 +3460,7 @@
set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
- if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode)
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
x->mb_energy = av1_block_energy(cpi, x, bsize);
if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
@@ -2269,10 +3488,16 @@
partition_horz_allowed &= force_horz_split;
partition_vert_allowed &= force_vert_split;
}
+
+#if CONFIG_VAR_TX
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+#endif
#if !CONFIG_PVQ
- save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ save_context(x, &x_ctx, mi_row, mi_col, bsize);
#else
- save_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
+ save_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
#endif
#if CONFIG_FP_MB_STATS
@@ -2336,27 +3561,39 @@
// PARTITION_NONE
if (partition_none_allowed) {
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize,
- ctx_none, best_rdc.rdcost);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+#if CONFIG_SUPERTX
+ &this_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_NONE,
+#endif
+ bsize, ctx_none, best_rdc.rdcost);
if (this_rdc.rate != INT_MAX) {
if (bsize_at_least_8x8) {
- const int partition_context =
- partition_plane_context(xd, mi_row, mi_col, bsize);
- this_rdc.rate += cpi->partition_cost[partition_context][PARTITION_NONE];
+ this_rdc.rate += partition_cost[PARTITION_NONE];
this_rdc.rdcost =
RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
+#if CONFIG_SUPERTX
+ this_rate_nocoef += partition_cost[PARTITION_NONE];
+#endif
}
if (this_rdc.rdcost < best_rdc.rdcost) {
// Adjust dist breakout threshold according to the partition size.
const int64_t dist_breakout_thr =
cpi->sf.partition_search_breakout_dist_thr >>
- (8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]));
+ ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
+ (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]));
const int rate_breakout_thr =
cpi->sf.partition_search_breakout_rate_thr *
num_pels_log2_lookup[bsize];
best_rdc = this_rdc;
+#if CONFIG_SUPERTX
+ best_rate_nocoef = this_rate_nocoef;
+ assert(best_rate_nocoef >= 0);
+#endif
if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
// If all y, u, v transform blocks in this partition are skippable, and
@@ -2419,9 +3656,9 @@
}
}
#if !CONFIG_PVQ
- restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
#else
- restore_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
#endif
}
@@ -2435,16 +3672,84 @@
int reached_last_index = 0;
subsize = get_subsize(bsize, PARTITION_SPLIT);
if (bsize == BLOCK_8X8) {
+#if CONFIG_DUAL_FILTER
+ if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
+ pc_tree->leaf_split[0]->pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter[0];
+#else
if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
pc_tree->leaf_split[0]->pred_interp_filter =
ctx_none->mic.mbmi.interp_filter;
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
- pc_tree->leaf_split[0], best_rdc.rdcost);
- if (sum_rdc.rate == INT_MAX) sum_rdc.rdcost = INT64_MAX;
+#endif
+#if CONFIG_SUPERTX
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+ &sum_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_SPLIT,
+#endif
+ subsize, pc_tree->leaf_split[0], INT64_MAX);
+#else
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_SPLIT,
+#endif
+ subsize, pc_tree->leaf_split[0], best_rdc.rdcost);
+#endif // CONFIG_SUPERTX
+ if (sum_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef = INT_MAX;
+#endif
+ }
+#if CONFIG_SUPERTX
+ if (supertx_allowed && sum_rdc.rdcost < INT64_MAX) {
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ const PARTITION_TYPE best_partition = pc_tree->partitioning;
+
+ pc_tree->partitioning = PARTITION_SPLIT;
+
+ sum_rdc.rate +=
+ av1_cost_bit(cm->fc->supertx_prob
+ [partition_supertx_context_lookup[PARTITION_SPLIT]]
+ [supertx_size],
+ 0);
+ sum_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+ if (is_inter_mode(pc_tree->leaf_split[0]->mic.mbmi.mode)) {
+ TX_TYPE best_tx = DCT_DCT;
+ RD_COST tmp_rdc = { sum_rate_nocoef, 0, 0 };
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+ rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+ &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree);
+
+ tmp_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob
+ [partition_supertx_context_lookup[PARTITION_SPLIT]]
+ [supertx_size],
+ 1);
+ tmp_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+ if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+ sum_rdc = tmp_rdc;
+ update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+ supertx_size, pc_tree);
+ }
+ }
+
+ pc_tree->partitioning = best_partition;
+ }
+#endif // CONFIG_SUPERTX
reached_last_index = 1;
} else {
int idx;
+#if CONFIG_SUPERTX
+ for (idx = 0; idx < 4 && sum_rdc.rdcost < INT64_MAX; ++idx) {
+#else
for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) {
+#endif // CONFIG_SUPERTX
const int x_idx = (idx & 1) * mi_step;
const int y_idx = (idx >> 1) * mi_step;
@@ -2454,30 +3759,88 @@
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
pc_tree->split[idx]->index = idx;
+#if CONFIG_SUPERTX
+ rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx,
+ mi_col + x_idx, subsize, &this_rdc, &this_rate_nocoef,
+ INT64_MAX - sum_rdc.rdcost, pc_tree->split[idx]);
+#else
rd_pick_partition(
cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize,
&this_rdc, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[idx]);
+#endif // CONFIG_SUPERTX
if (this_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
break;
} else {
sum_rdc.rate += this_rdc.rate;
sum_rdc.dist += this_rdc.dist;
sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += this_rate_nocoef;
+#endif // CONFIG_SUPERTX
}
}
reached_last_index = (idx == 4);
+#if CONFIG_SUPERTX
+ if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && reached_last_index) {
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ const PARTITION_TYPE best_partition = pc_tree->partitioning;
+
+ pc_tree->partitioning = PARTITION_SPLIT;
+
+ sum_rdc.rate +=
+ av1_cost_bit(cm->fc->supertx_prob
+ [partition_supertx_context_lookup[PARTITION_SPLIT]]
+ [supertx_size],
+ 0);
+ sum_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+ if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+ TX_TYPE best_tx = DCT_DCT;
+ RD_COST tmp_rdc = { sum_rate_nocoef, 0, 0 };
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+ rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+ &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree);
+
+ tmp_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob
+ [partition_supertx_context_lookup[PARTITION_SPLIT]]
+ [supertx_size],
+ 1);
+ tmp_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+ if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+ sum_rdc = tmp_rdc;
+ update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+ supertx_size, pc_tree);
+ }
+ }
+
+ pc_tree->partitioning = best_partition;
+ }
+#endif // CONFIG_SUPERTX
}
if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
- const int partition_context =
- partition_plane_context(xd, mi_row, mi_col, bsize);
- sum_rdc.rate += cpi->partition_cost[partition_context][PARTITION_SPLIT];
+ sum_rdc.rate += partition_cost[PARTITION_SPLIT];
sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += partition_cost[PARTITION_SPLIT];
+#endif // CONFIG_SUPERTX
if (sum_rdc.rdcost < best_rdc.rdcost) {
best_rdc = sum_rdc;
+#if CONFIG_SUPERTX
+ best_rate_nocoef = sum_rate_nocoef;
+ assert(best_rate_nocoef >= 0);
+#endif // CONFIG_SUPERTX
pc_tree->partitioning = PARTITION_SPLIT;
}
} else if (cpi->sf.less_rectangular_check) {
@@ -2486,61 +3849,152 @@
do_rectangular_split &= !partition_none_allowed;
}
#if !CONFIG_PVQ
- restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
#else
- restore_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
#endif
- }
+ } // if (do_split)
// PARTITION_HORZ
if (partition_horz_allowed &&
(do_rectangular_split || av1_active_h_edge(cpi, mi_row, mi_step))) {
subsize = get_subsize(bsize, PARTITION_HORZ);
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+#if CONFIG_DUAL_FILTER
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->horizontal[0].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter[0];
+#else
if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
partition_none_allowed)
pc_tree->horizontal[0].pred_interp_filter =
ctx_none->mic.mbmi.interp_filter;
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
- &pc_tree->horizontal[0], best_rdc.rdcost);
+#endif
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+#if CONFIG_SUPERTX
+ &sum_rate_nocoef,
+#endif // CONFIG_SUPERTX
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_HORZ,
+#endif
+ subsize, &pc_tree->horizontal[0], best_rdc.rdcost);
- if (sum_rdc.rdcost < best_rdc.rdcost && !force_horz_split &&
- bsize > BLOCK_8X8) {
+#if CONFIG_SUPERTX
+ abort_flag = (sum_rdc.rdcost >= best_rd && bsize > BLOCK_8X8) ||
+ (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8);
+ if (sum_rdc.rdcost < INT64_MAX &&
+#else
+ if (sum_rdc.rdcost < best_rdc.rdcost &&
+#endif // CONFIG_SUPERTX
+ !force_horz_split && bsize > BLOCK_8X8) {
PICK_MODE_CONTEXT *ctx_h = &pc_tree->horizontal[0];
- update_state(cpi, td, ctx_h, mi_row, mi_col, subsize);
- encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx_h);
+ update_state(cpi, td, ctx_h, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+ ctx_h, NULL);
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_h);
+
+#if CONFIG_DUAL_FILTER
if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
partition_none_allowed)
pc_tree->horizontal[1].pred_interp_filter =
- ctx_h->mic.mbmi.interp_filter;
+ ctx_h->mic.mbmi.interp_filter[0];
+#else
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->horizontal[1].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter;
+#endif
+#if CONFIG_SUPERTX
rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
+ &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_HORZ,
+#endif
+ subsize, &pc_tree->horizontal[1], INT64_MAX);
+#else
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_HORZ,
+#endif
subsize, &pc_tree->horizontal[1],
best_rdc.rdcost - sum_rdc.rdcost);
+#endif // CONFIG_SUPERTX
if (this_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
} else {
sum_rdc.rate += this_rdc.rate;
sum_rdc.dist += this_rdc.dist;
sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += this_rate_nocoef;
+#endif // CONFIG_SUPERTX
}
}
- if (sum_rdc.rdcost < best_rdc.rdcost) {
- const int partition_context =
- partition_plane_context(xd, mi_row, mi_col, bsize);
- sum_rdc.rate += cpi->partition_cost[partition_context][PARTITION_HORZ];
+#if CONFIG_SUPERTX
+ if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) {
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ const PARTITION_TYPE best_partition = pc_tree->partitioning;
+
+ pc_tree->partitioning = PARTITION_HORZ;
+
+ sum_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]]
+ [supertx_size],
+ 0);
sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+ if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+ TX_TYPE best_tx = DCT_DCT;
+ RD_COST tmp_rdc = { sum_rate_nocoef, 0, 0 };
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+ rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate,
+ &tmp_rdc.dist, &best_tx, pc_tree);
+
+ tmp_rdc.rate += av1_cost_bit(
+ cm->fc
+ ->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]]
+ [supertx_size],
+ 1);
+ tmp_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+ if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+ sum_rdc = tmp_rdc;
+ update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+ supertx_size, pc_tree);
+ }
+ }
+
+ pc_tree->partitioning = best_partition;
+ }
+#endif // CONFIG_SUPERTX
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rate += partition_cost[PARTITION_HORZ];
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += partition_cost[PARTITION_HORZ];
+#endif // CONFIG_SUPERTX
if (sum_rdc.rdcost < best_rdc.rdcost) {
best_rdc = sum_rdc;
+#if CONFIG_SUPERTX
+ best_rate_nocoef = sum_rate_nocoef;
+ assert(best_rate_nocoef >= 0);
+#endif // CONFIG_SUPERTX
pc_tree->partitioning = PARTITION_HORZ;
}
}
#if !CONFIG_PVQ
- restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
#else
- restore_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
#endif
}
@@ -2550,67 +4004,224 @@
subsize = get_subsize(bsize, PARTITION_VERT);
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+#if CONFIG_DUAL_FILTER
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->vertical[0].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter[0];
+#else
if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
partition_none_allowed)
pc_tree->vertical[0].pred_interp_filter =
ctx_none->mic.mbmi.interp_filter;
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
- &pc_tree->vertical[0], best_rdc.rdcost);
- if (sum_rdc.rdcost < best_rdc.rdcost && !force_vert_split &&
- bsize > BLOCK_8X8) {
- update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize);
- encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize,
- &pc_tree->vertical[0]);
+#endif
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+#if CONFIG_SUPERTX
+ &sum_rate_nocoef,
+#endif // CONFIG_SUPERTX
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_VERT,
+#endif
+ subsize, &pc_tree->vertical[0], best_rdc.rdcost);
+#if CONFIG_SUPERTX
+ abort_flag = (sum_rdc.rdcost >= best_rd && bsize > BLOCK_8X8) ||
+ (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8);
+ if (sum_rdc.rdcost < INT64_MAX &&
+#else
+ if (sum_rdc.rdcost < best_rdc.rdcost &&
+#endif // CONFIG_SUPERTX
+ !force_vert_split && bsize > BLOCK_8X8) {
+ update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize,
+ &pc_tree->vertical[0], NULL);
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+#if CONFIG_DUAL_FILTER
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ pc_tree->vertical[1].pred_interp_filter =
+ ctx_none->mic.mbmi.interp_filter[0];
+#else
if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
partition_none_allowed)
pc_tree->vertical[1].pred_interp_filter =
ctx_none->mic.mbmi.interp_filter;
+#endif
+#if CONFIG_SUPERTX
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
+ &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_VERT,
+#endif
+ subsize, &pc_tree->vertical[1],
+ INT64_MAX - sum_rdc.rdcost);
+#else
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_VERT,
+#endif
subsize, &pc_tree->vertical[1],
best_rdc.rdcost - sum_rdc.rdcost);
+#endif // CONFIG_SUPERTX
if (this_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
} else {
sum_rdc.rate += this_rdc.rate;
sum_rdc.dist += this_rdc.dist;
sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += this_rate_nocoef;
+#endif // CONFIG_SUPERTX
}
}
+#if CONFIG_SUPERTX
+ if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) {
+ TX_SIZE supertx_size = max_txsize_lookup[bsize];
+ const PARTITION_TYPE best_partition = pc_tree->partitioning;
+
+ pc_tree->partitioning = PARTITION_VERT;
+
+ sum_rdc.rate += av1_cost_bit(
+ cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]]
+ [supertx_size],
+ 0);
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+ if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+ TX_TYPE best_tx = DCT_DCT;
+ RD_COST tmp_rdc = { sum_rate_nocoef, 0, 0 };
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+ rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize, &tmp_rdc.rate,
+ &tmp_rdc.dist, &best_tx, pc_tree);
+
+ tmp_rdc.rate += av1_cost_bit(
+ cm->fc
+ ->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]]
+ [supertx_size],
+ 1);
+ tmp_rdc.rdcost =
+ RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+ if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+ sum_rdc = tmp_rdc;
+ update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+ supertx_size, pc_tree);
+ }
+ }
+
+ pc_tree->partitioning = best_partition;
+ }
+#endif // CONFIG_SUPERTX
if (sum_rdc.rdcost < best_rdc.rdcost) {
- const int partition_context =
- partition_plane_context(xd, mi_row, mi_col, bsize);
- sum_rdc.rate += cpi->partition_cost[partition_context][PARTITION_VERT];
+ sum_rdc.rate += partition_cost[PARTITION_VERT];
sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+ sum_rate_nocoef += partition_cost[PARTITION_VERT];
+#endif // CONFIG_SUPERTX
if (sum_rdc.rdcost < best_rdc.rdcost) {
best_rdc = sum_rdc;
+#if CONFIG_SUPERTX
+ best_rate_nocoef = sum_rate_nocoef;
+ assert(best_rate_nocoef >= 0);
+#endif // CONFIG_SUPERTX
pc_tree->partitioning = PARTITION_VERT;
}
}
#if !CONFIG_PVQ
- restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
#else
- restore_context(x, mi_row, mi_col, a, l, sa, sl, &pre_rdo_buf, bsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, &pre_rdo_buf, bsize);
#endif
}
+#if CONFIG_EXT_PARTITION_TYPES
+ // PARTITION_HORZ_A
+ if (partition_horz_allowed && do_rectangular_split && bsize > BLOCK_8X8 &&
+ partition_none_allowed) {
+ subsize = get_subsize(bsize, PARTITION_HORZ_A);
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_HORZ_A,
+#if CONFIG_SUPERTX
+ best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+ mi_row, mi_col, bsize2, mi_row, mi_col + mi_step, bsize2,
+ mi_row + mi_step, mi_col, subsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+ }
+ // PARTITION_HORZ_B
+ if (partition_horz_allowed && do_rectangular_split && bsize > BLOCK_8X8 &&
+ partition_none_allowed) {
+ subsize = get_subsize(bsize, PARTITION_HORZ_B);
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_HORZ_B,
+#if CONFIG_SUPERTX
+ best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+ mi_row, mi_col, subsize, mi_row + mi_step, mi_col,
+ bsize2, mi_row + mi_step, mi_col + mi_step, bsize2);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+ }
+ // PARTITION_VERT_A
+ if (partition_vert_allowed && do_rectangular_split && bsize > BLOCK_8X8 &&
+ partition_none_allowed) {
+ subsize = get_subsize(bsize, PARTITION_VERT_A);
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->verticala, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_VERT_A,
+#if CONFIG_SUPERTX
+ best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+ mi_row, mi_col, bsize2, mi_row + mi_step, mi_col, bsize2,
+ mi_row, mi_col + mi_step, subsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+ }
+ // PARTITION_VERT_B
+ if (partition_vert_allowed && do_rectangular_split && bsize > BLOCK_8X8 &&
+ partition_none_allowed) {
+ subsize = get_subsize(bsize, PARTITION_VERT_B);
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_VERT_B,
+#if CONFIG_SUPERTX
+ best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+ mi_row, mi_col, subsize, mi_row, mi_col + mi_step,
+ bsize2, mi_row + mi_step, mi_col + mi_step, bsize2);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+ }
+#endif // CONFIG_EXT_PARTITION_TYPES
+
 // TODO(jbb): This code was added so that we avoid a static analysis
 // warning related to the fact that best_rd isn't used after this
// point. This code should be refactored so that the duplicate
// checks occur in some sub function and thus are used...
(void)best_rd;
*rd_cost = best_rdc;
+#if CONFIG_SUPERTX
+ *rate_nocoef = best_rate_nocoef;
+#endif // CONFIG_SUPERTX
if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
pc_tree->index != 3) {
- int output_enabled = (bsize == BLOCK_64X64);
- encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
- pc_tree);
+ if (bsize == cm->sb_size) {
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
}
- if (bsize == BLOCK_64X64) {
+ if (bsize == cm->sb_size) {
#if !CONFIG_PVQ
assert(tp_orig < *tp || (tp_orig == *tp && xd->mi[0]->mbmi.skip));
#endif
@@ -2625,15 +4236,19 @@
TileDataEnc *tile_data, int mi_row,
TOKENEXTRA **tp) {
AV1_COMMON *const cm = &cpi->common;
- TileInfo *const tile_info = &tile_data->tile_info;
+ const TileInfo *const tile_info = &tile_data->tile_info;
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
SPEED_FEATURES *const sf = &cpi->sf;
int mi_col;
+#if CONFIG_EXT_PARTITION
+ const int leaf_nodes = 256;
+#else
+ const int leaf_nodes = 64;
+#endif // CONFIG_EXT_PARTITION
// Initialize the left context for the new SB row
- memset(&xd->left_context, 0, sizeof(xd->left_context));
- memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
+ av1_zero_left_context(xd);
#if CONFIG_DELTA_Q
// Reset delta for every tile
@@ -2643,21 +4258,26 @@
// Code each SB in the row
for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
- mi_col += MAX_MIB_SIZE) {
+ mi_col += cm->mib_size) {
const struct segmentation *const seg = &cm->seg;
int dummy_rate;
int64_t dummy_dist;
RD_COST dummy_rdc;
+#if CONFIG_SUPERTX
+ int dummy_rate_nocoef;
+#endif // CONFIG_SUPERTX
int i;
int seg_skip = 0;
const int idx_str = cm->mi_stride * mi_row + mi_col;
MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+ PC_TREE *const pc_root = td->pc_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
if (sf->adaptive_pred_interp_filter) {
- for (i = 0; i < 64; ++i) td->leaf_tree[i].pred_interp_filter = SWITCHABLE;
+ for (i = 0; i < leaf_nodes; ++i)
+ td->leaf_tree[i].pred_interp_filter = SWITCHABLE;
- for (i = 0; i < 64; ++i) {
+ for (i = 0; i < leaf_nodes; ++i) {
td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
@@ -2666,12 +4286,12 @@
}
av1_zero(x->pred_mv);
- td->pc_root->index = 0;
+ pc_root->index = 0;
if (seg->enabled) {
const uint8_t *const map =
seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
- int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+ int segment_id = get_segment_id(cm, map, cm->sb_size, mi_row, mi_col);
seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
}
@@ -2696,60 +4316,94 @@
set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
xd->mi[0]->mbmi.current_q_index = current_qindex;
xd->mi[0]->mbmi.segment_id = 0;
- av1_init_plane_quantizers(cpi, x);
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
}
#endif
x->source_variance = UINT_MAX;
if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
- const BLOCK_SIZE bsize =
- seg_skip ? BLOCK_64X64 : sf->always_this_block_size;
- set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+ BLOCK_SIZE bsize;
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
+ bsize = seg_skip ? cm->sb_size : sf->always_this_block_size;
set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
- rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, td->pc_root);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size,
+ &dummy_rate, &dummy_dist,
+#if CONFIG_SUPERTX
+ &dummy_rate_nocoef,
+#endif // CONFIG_SUPERTX
+ 1, pc_root);
} else if (cpi->partition_search_skippable_frame) {
BLOCK_SIZE bsize;
- set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
- rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, td->pc_root);
- } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
- cm->frame_type != KEY_FRAME) {
- choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
- rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, td->pc_root);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size,
+ &dummy_rate, &dummy_dist,
+#if CONFIG_SUPERTX
+ &dummy_rate_nocoef,
+#endif // CONFIG_SUPERTX
+ 1, pc_root);
+ } else if (sf->partition_search_type == VAR_BASED_PARTITION) {
+ choose_partitioning(cpi, td, tile_info, x, mi_row, mi_col);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, cm->sb_size,
+ &dummy_rate, &dummy_dist,
+#if CONFIG_SUPERTX
+ &dummy_rate_nocoef,
+#endif // CONFIG_SUPERTX
+ 1, pc_root);
} else {
// If required set upper and lower partition size limits
if (sf->auto_min_max_partition_size) {
- set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
&x->min_partition_size, &x->max_partition_size);
}
- rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rdc, INT64_MAX, td->pc_root);
+ rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->sb_size,
+ &dummy_rdc,
+#if CONFIG_SUPERTX
+ &dummy_rate_nocoef,
+#endif // CONFIG_SUPERTX
+ INT64_MAX, pc_root);
}
}
+#if CONFIG_ENTROPY
+ if (cm->do_subframe_update &&
+ cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ if ((mi_row + MI_SIZE) %
+ (MI_SIZE *
+ AOMMAX(cm->mi_rows / MI_SIZE / COEF_PROBS_BUFS, 1)) ==
+ 0 &&
+ mi_row + MI_SIZE < cm->mi_rows &&
+ cm->coef_probs_update_idx < COEF_PROBS_BUFS - 1) {
+ TX_SIZE t;
+ SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats;
+
+ for (t = TX_4X4; t <= TX_32X32; ++t)
+ av1_full_to_model_counts(cpi->td.counts->coef[t],
+ cpi->td.rd_counts.coef_counts[t]);
+ av1_partial_adapt_probs(cm, mi_row, mi_col);
+ ++cm->coef_probs_update_idx;
+ av1_copy(subframe_stats->coef_probs_buf[cm->coef_probs_update_idx],
+ cm->fc->coef_probs);
+ av1_copy(subframe_stats->coef_counts_buf[cm->coef_probs_update_idx],
+ cpi->td.rd_counts.coef_counts);
+ av1_copy(subframe_stats->eob_counts_buf[cm->coef_probs_update_idx],
+ cm->counts.eob_branch);
+ av1_fill_token_costs(x->token_costs, cm->fc->coef_probs);
+ }
+ }
+#endif // CONFIG_ENTROPY
}
static void init_encode_frame_mb_context(AV1_COMP *cpi) {
MACROBLOCK *const x = &cpi->td.mb;
AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
- const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
// Copy data over into macro block data structures.
av1_setup_src_planes(x, cpi->Source, 0, 0);
- av1_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
-
- // Note: this memset assumes above_context[0], [1] and [2]
- // are allocated as part of the same buffer.
- memset(xd->above_context[0], 0,
- sizeof(*xd->above_context[0]) * 2 * aligned_mi_cols * MAX_MB_PLANE);
- memset(xd->above_seg_context, 0,
- sizeof(*xd->above_seg_context) * aligned_mi_cols);
+ av1_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
}
static int check_dual_ref_flags(AV1_COMP *cpi) {
@@ -2767,6 +4421,7 @@
}
}
+#if !CONFIG_VAR_TX
static void reset_skip_tx_size(AV1_COMMON *cm, TX_SIZE max_tx_size) {
int mi_row, mi_col;
const int mis = cm->mi_stride;
@@ -2774,20 +4429,28 @@
for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) {
for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
- if (mi_ptr[mi_col]->mbmi.tx_size > max_tx_size)
+ if (txsize_sqr_up_map[mi_ptr[mi_col]->mbmi.tx_size] > max_tx_size)
mi_ptr[mi_col]->mbmi.tx_size = max_tx_size;
}
}
}
+#endif
static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) {
- if (frame_is_intra_only(&cpi->common))
- return INTRA_FRAME;
+ if (frame_is_intra_only(&cpi->common)) return INTRA_FRAME;
+#if CONFIG_EXT_REFS
+ // We will not update the golden frame with an internal overlay frame
+ else if ((cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) ||
+ cpi->rc.is_src_frame_ext_arf)
+#else
else if (cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame)
+#endif
return ALTREF_FRAME;
else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
return GOLDEN_FRAME;
else
+ // TODO(zoeliu): To investigate whether a frame_type other than
+ // INTRA/ALTREF/GOLDEN/LAST needs to be specified separately.
return LAST_FRAME;
}
@@ -2804,11 +4467,11 @@
void av1_init_tile_data(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
- const int tile_cols = 1 << cm->log2_tile_cols;
- const int tile_rows = 1 << cm->log2_tile_rows;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
int tile_col, tile_row;
TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
- int tile_tok = 0;
+ unsigned int tile_tok = 0;
if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
if (cpi->tile_data != NULL) aom_free(cpi->tile_data);
@@ -2818,7 +4481,7 @@
for (tile_row = 0; tile_row < tile_rows; ++tile_row)
for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- TileDataEnc *tile_data =
+ TileDataEnc *const tile_data =
&cpi->tile_data[tile_row * tile_cols + tile_col];
int i, j;
for (i = 0; i < BLOCK_SIZES; ++i) {
@@ -2840,7 +4503,7 @@
for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- TileInfo *tile_info =
+ TileInfo *const tile_info =
&cpi->tile_data[tile_row * tile_cols + tile_col].tile_info;
av1_tile_init(tile_info, cm, tile_row, tile_col);
@@ -2857,8 +4520,8 @@
void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
int tile_col) {
AV1_COMMON *const cm = &cpi->common;
- const int tile_cols = 1 << cm->log2_tile_cols;
- TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+ TileDataEnc *const this_tile =
+ &cpi->tile_data[tile_row * cm->tile_cols + tile_col];
const TileInfo *const tile_info = &this_tile->tile_info;
TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
int mi_row;
@@ -2866,9 +4529,13 @@
od_adapt_ctx *adapt;
#endif
+ av1_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end);
+
// Set up pointers to per thread motion search counters.
- td->mb.m_search_count_ptr = &td->rd_counts.m_search_count;
- td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count;
+ this_tile->m_search_count = 0; // Count of motion search hits.
+ this_tile->ex_search_count = 0; // Exhaustive mesh search hits.
+ td->mb.m_search_count_ptr = &this_tile->m_search_count;
+ td->mb.ex_search_count_ptr = &this_tile->ex_search_count;
#if CONFIG_PVQ
td->mb.pvq_q = &this_tile->pvq_q;
@@ -2900,13 +4567,13 @@
#endif
for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
- mi_row += MAX_MIB_SIZE) {
+ mi_row += cm->mib_size) {
encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
}
+
cpi->tok_count[tile_row][tile_col] =
(unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]);
- assert(tok - cpi->tile_tok[tile_row][tile_col] <=
- allocated_tokens(*tile_info));
+ assert(cpi->tok_count[tile_row][tile_col] <= allocated_tokens(*tile_info));
#if CONFIG_PVQ
od_ec_enc_clear(&td->mb.daala_enc.ec);
@@ -2921,14 +4588,12 @@
static void encode_tiles(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
- const int tile_cols = 1 << cm->log2_tile_cols;
- const int tile_rows = 1 << cm->log2_tile_rows;
int tile_col, tile_row;
av1_init_tile_data(cpi);
- for (tile_row = 0; tile_row < tile_rows; ++tile_row)
- for (tile_col = 0; tile_col < tile_cols; ++tile_col)
+ for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row)
+ for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col)
av1_encode_tile(cpi, &cpi->td, tile_row, tile_col);
}
@@ -2946,6 +4611,153 @@
}
#endif
+#if CONFIG_GLOBAL_MOTION
+#define MIN_TRANS_THRESH 8
+#define GLOBAL_MOTION_ADVANTAGE_THRESH 0.60
+#define GLOBAL_MOTION_MODEL ROTZOOM
+
+// Adds some offset to a global motion parameter and handles
+// all of the necessary precision shifts, clamping, and
+// zero-centering.
+static int32_t add_param_offset(int param_index, int32_t param_value,
+ int32_t offset) {
+ const int scale_vals[2] = { GM_ALPHA_PREC_DIFF, GM_TRANS_PREC_DIFF };
+ const int clamp_vals[2] = { GM_ALPHA_MAX, GM_TRANS_MAX };
+ const int is_trans_param = param_index < 2;
+ const int is_one_centered = (!is_trans_param) && (param_index & 1);
+
+ // Make parameter zero-centered and offset the shift that was done to make
+ // it compatible with the warped model
+ param_value = (param_value - (is_one_centered << WARPEDMODEL_PREC_BITS)) >>
+ scale_vals[is_trans_param];
+ // Add desired offset to the rescaled/zero-centered parameter
+ param_value += offset;
+ // Clamp the parameter so it does not overflow the number of bits allotted
+ // to it in the bitstream
+ param_value = (int32_t)clamp(param_value, -clamp_vals[is_trans_param],
+ clamp_vals[is_trans_param]);
+ // Rescale the parameter to WARPEDMODEL_PREC_BITS so it is compatible
+ // with the warped motion library
+ param_value *= (1 << scale_vals[is_trans_param]);
+
+ // Undo the zero-centering step if necessary
+ return param_value + (is_one_centered << WARPEDMODEL_PREC_BITS);
+}
+
+static void refine_integerized_param(WarpedMotionParams *wm,
+#if CONFIG_AOM_HIGHBITDEPTH
+ int use_hbd, int bd,
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ uint8_t *ref, int r_width, int r_height,
+ int r_stride, uint8_t *dst, int d_width,
+ int d_height, int d_stride,
+ int n_refinements) {
+ int i = 0, p;
+ int n_params = n_trans_model_params[wm->wmtype];
+ int32_t *param_mat = wm->wmmat;
+ double step_error;
+ int32_t step;
+ int32_t *param;
+ int32_t curr_param;
+ int32_t best_param;
+
+ double best_error =
+ av1_warp_erroradv(wm,
+#if CONFIG_AOM_HIGHBITDEPTH
+ use_hbd, bd,
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ ref, r_width, r_height, r_stride, dst, 0, 0, d_width,
+ d_height, d_stride, 0, 0, 16, 16);
+ for (p = 0; p < n_params; ++p) {
+ param = param_mat + p;
+ step = 1 << (n_refinements + 1);
+ curr_param = *param;
+ best_param = curr_param;
+ for (i = 0; i < n_refinements; i++) {
+ // look to the left
+ *param = add_param_offset(p, curr_param, -step);
+ step_error =
+ av1_warp_erroradv(wm,
+#if CONFIG_AOM_HIGHBITDEPTH
+ use_hbd, bd,
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ ref, r_width, r_height, r_stride, dst, 0, 0,
+ d_width, d_height, d_stride, 0, 0, 16, 16);
+ if (step_error < best_error) {
+ step >>= 1;
+ best_error = step_error;
+ best_param = *param;
+ curr_param = best_param;
+ continue;
+ }
+
+ // look to the right
+ *param = add_param_offset(p, curr_param, step);
+ step_error =
+ av1_warp_erroradv(wm,
+#if CONFIG_AOM_HIGHBITDEPTH
+ use_hbd, bd,
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ ref, r_width, r_height, r_stride, dst, 0, 0,
+ d_width, d_height, d_stride, 0, 0, 16, 16);
+ if (step_error < best_error) {
+ step >>= 1;
+ best_error = step_error;
+ best_param = *param;
+ curr_param = best_param;
+ continue;
+ }
+
+ // No improvement found -> we are either already at a minimum or the
+ // step is too wide.
+ step >>= 1;
+ }
+
+ *param = best_param;
+ }
+}
+
+static void convert_to_params(const double *params, TransformationType type,
+ int32_t *model) {
+ int i, diag_value;
+ int alpha_present = 0;
+ int n_params = n_trans_model_params[type];
+ model[0] = (int32_t)floor(params[0] * (1 << GM_TRANS_PREC_BITS) + 0.5);
+ model[1] = (int32_t)floor(params[1] * (1 << GM_TRANS_PREC_BITS) + 0.5);
+ model[0] = (int32_t)clamp(model[0], GM_TRANS_MIN, GM_TRANS_MAX) *
+ GM_TRANS_DECODE_FACTOR;
+ model[1] = (int32_t)clamp(model[1], GM_TRANS_MIN, GM_TRANS_MAX) *
+ GM_TRANS_DECODE_FACTOR;
+
+ for (i = 2; i < n_params; ++i) {
+ diag_value = ((i & 1) ? (1 << GM_ALPHA_PREC_BITS) : 0);
+ model[i] = (int32_t)floor(params[i] * (1 << GM_ALPHA_PREC_BITS) + 0.5);
+ model[i] =
+ (int32_t)(clamp(model[i] - diag_value, GM_ALPHA_MIN, GM_ALPHA_MAX) +
+ diag_value) *
+ GM_ALPHA_DECODE_FACTOR;
+ alpha_present |= (model[i] != 0);
+ }
+
+ if (!alpha_present) {
+ if (abs(model[0]) < MIN_TRANS_THRESH && abs(model[1]) < MIN_TRANS_THRESH) {
+ model[0] = 0;
+ model[1] = 0;
+ }
+ }
+}
+
+static void convert_model_to_params(const double *params,
+ TransformationType type,
+ Global_Motion_Params *model) {
+ // TODO(sarahparker) implement for homography
+ if (type > HOMOGRAPHY)
+ convert_to_params(params, type, model->motion_params.wmmat);
+ model->gmtype = get_gmtype(model);
+ model->motion_params.wmtype = gm_to_trans_type(model->gmtype);
+}
+#endif // CONFIG_GLOBAL_MOTION
+
static void encode_frame_internal(AV1_COMP *cpi) {
ThreadData *const td = &cpi->td;
MACROBLOCK *const x = &td->mb;
@@ -2954,6 +4766,8 @@
RD_COUNTS *const rdc = &cpi->td.rd_counts;
int i;
+ x->min_partition_size = AOMMIN(x->min_partition_size, cm->sb_size);
+ x->max_partition_size = AOMMIN(x->max_partition_size, cm->sb_size);
#if CONFIG_SIMP_MV_PRED
cm->setup_mi(cm);
#endif
@@ -2964,8 +4778,51 @@
av1_zero(*td->counts);
av1_zero(rdc->coef_counts);
av1_zero(rdc->comp_pred_diff);
- rdc->m_search_count = 0; // Count of motion search hits.
- rdc->ex_search_count = 0; // Exhaustive mesh search hits.
+
+#if CONFIG_GLOBAL_MOTION
+ aom_clear_system_state();
+ av1_zero(cpi->global_motion_used);
+ if (cpi->common.frame_type == INTER_FRAME && cpi->Source) {
+ YV12_BUFFER_CONFIG *ref_buf;
+ int frame;
+ double erroradvantage = 0;
+ double params[9] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+ ref_buf = get_ref_frame_buffer(cpi, frame);
+ if (ref_buf) {
+ if (compute_global_motion_feature_based(GLOBAL_MOTION_MODEL,
+ cpi->Source, ref_buf, params)) {
+ convert_model_to_params(params, GLOBAL_MOTION_MODEL,
+ &cm->global_motion[frame]);
+ if (get_gmtype(&cm->global_motion[frame]) > GLOBAL_ZERO) {
+ refine_integerized_param(
+ &cm->global_motion[frame].motion_params,
+#if CONFIG_AOM_HIGHBITDEPTH
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ ref_buf->y_buffer, ref_buf->y_width, ref_buf->y_height,
+ ref_buf->y_stride, cpi->Source->y_buffer, cpi->Source->y_width,
+ cpi->Source->y_height, cpi->Source->y_stride, 3);
+ // compute the advantage of using gm parameters over 0 motion
+ erroradvantage = av1_warp_erroradv(
+ &cm->global_motion[frame].motion_params,
+#if CONFIG_AOM_HIGHBITDEPTH
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ ref_buf->y_buffer, ref_buf->y_width, ref_buf->y_height,
+ ref_buf->y_stride, cpi->Source->y_buffer, 0, 0,
+ cpi->Source->y_width, cpi->Source->y_height,
+ cpi->Source->y_stride, 0, 0, 16, 16);
+ if (erroradvantage > GLOBAL_MOTION_ADVANTAGE_THRESH)
+ // Not enough advantage in using a global model; set it to zero.
+ memset(&cm->global_motion[frame], 0,
+ sizeof(cm->global_motion[frame]));
+ }
+ }
+ }
+ }
+ }
+#endif // CONFIG_GLOBAL_MOTION
for (i = 0; i < MAX_SEGMENTS; ++i) {
const int qindex = cm->seg.enabled
@@ -2978,16 +4835,15 @@
if (!cm->seg.enabled && xd->lossless[0]) x->optimize = 0;
cm->tx_mode = select_tx_mode(cpi, xd);
-
av1_frame_init_quantizer(cpi);
av1_initialize_rd_consts(cpi);
av1_initialize_me_consts(cpi, x, cm->base_qindex);
init_encode_frame_mb_context(cpi);
+
cm->use_prev_frame_mvs =
!cm->error_resilient_mode && cm->width == cm->last_width &&
- cm->height == cm->last_height && !cm->intra_only && cm->last_show_frame &&
- (cm->last_frame_type != KEY_FRAME);
+ cm->height == cm->last_height && !cm->intra_only && cm->last_show_frame;
#if CONFIG_DELTA_Q
// Fix delta q resolution for the moment
@@ -3017,7 +4873,16 @@
cm->prev_mi =
cm->use_prev_frame_mvs ? cm->prev_mip + cm->mi_stride + 1 : NULL;
- x->quant_fp = cpi->sf.use_quant_fp;
+#if CONFIG_VAR_TX
+ x->txb_split_count = 0;
+#if CONFIG_REF_MV
+ av1_zero(x->blk_skip_drl);
+#endif
+#endif
+
+ if (cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
+ cpi->td.var_root[0] == NULL)
+ av1_setup_var_tree(&cpi->common, &cpi->td);
{
struct aom_usec_timer emr_timer;
@@ -3031,7 +4896,10 @@
#endif
// If allowed, encoding tiles in parallel with one thread handling one tile.
- if (AOMMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1)
+ // TODO(geza.lore): The multi-threaded encoder is not safe with more than
+ // 1 tile row, as it uses the single above_context et al arrays from
+ // cpi->common
+ if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols) > 1 && cm->tile_rows == 1)
av1_encode_tiles_mt(cpi);
else
encode_tiles(cpi);
@@ -3046,11 +4914,6 @@
#endif
}
-static InterpFilter get_interp_filter(AV1_COMP *cpi) {
- (void)cpi;
- return SWITCHABLE;
-}
-
void av1_encode_frame(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
@@ -3068,6 +4931,7 @@
cpi->allow_comp_inter_inter = 0;
} else {
cpi->allow_comp_inter_inter = 1;
+
#if CONFIG_EXT_REFS
cm->comp_fwd_ref[0] = LAST_FRAME;
cm->comp_fwd_ref[1] = LAST2_FRAME;
@@ -3095,9 +4959,12 @@
// either compound, single or hybrid prediction as per whatever has
// worked best for that type of frame in the past.
// It also predicts whether another coding mode would have worked
- // better that this coding mode. If that is the case, it remembers
+ // better than this coding mode. If that is the case, it remembers
// that for subsequent frames.
// It does the same analysis for transform size selection also.
+ //
+ // TODO(zoeliu): To investigate whether a frame_type other than
+ // INTRA/ALTREF/GOLDEN/LAST needs to be specified separately.
const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];
const int is_alt_ref = frame_type == ALTREF_FRAME;
@@ -3114,8 +4981,9 @@
else
cm->reference_mode = REFERENCE_MODE_SELECT;
- if (cm->interp_filter == SWITCHABLE)
- cm->interp_filter = get_interp_filter(cpi);
+#if CONFIG_DUAL_FILTER
+ cm->interp_filter = SWITCHABLE;
+#endif
encode_frame_internal(cpi);
@@ -3140,69 +5008,258 @@
}
}
+#if CONFIG_VAR_TX
+ if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0)
+ cm->tx_mode = ALLOW_32X32;
+#else
if (cm->tx_mode == TX_MODE_SELECT) {
int count4x4 = 0;
int count8x8_lp = 0, count8x8_8x8p = 0;
int count16x16_16x16p = 0, count16x16_lp = 0;
int count32x32 = 0;
-
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
- count4x4 += counts->tx.p32x32[i][TX_4X4];
- count4x4 += counts->tx.p16x16[i][TX_4X4];
- count4x4 += counts->tx.p8x8[i][TX_4X4];
+ // counts->tx_size[max_depth][context_idx][this_depth_level]
+ count4x4 += counts->tx_size[0][i][0];
+ count4x4 += counts->tx_size[1][i][0];
+ count4x4 += counts->tx_size[2][i][0];
- count8x8_lp += counts->tx.p32x32[i][TX_8X8];
- count8x8_lp += counts->tx.p16x16[i][TX_8X8];
- count8x8_8x8p += counts->tx.p8x8[i][TX_8X8];
+ count8x8_lp += counts->tx_size[1][i][1];
+ count8x8_lp += counts->tx_size[2][i][1];
+ count8x8_8x8p += counts->tx_size[0][i][1];
- count16x16_16x16p += counts->tx.p16x16[i][TX_16X16];
- count16x16_lp += counts->tx.p32x32[i][TX_16X16];
- count32x32 += counts->tx.p32x32[i][TX_32X32];
+ count16x16_16x16p += counts->tx_size[1][i][2];
+ count16x16_lp += counts->tx_size[2][i][2];
+ count32x32 += counts->tx_size[2][i][3];
}
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ count4x4 += counts->tx_size_implied[0][TX_4X4];
+ count4x4 += counts->tx_size_implied[1][TX_4X4];
+ count4x4 += counts->tx_size_implied[2][TX_4X4];
+ count4x4 += counts->tx_size_implied[3][TX_4X4];
+ count8x8_lp += counts->tx_size_implied[2][TX_8X8];
+ count8x8_lp += counts->tx_size_implied[3][TX_8X8];
+ count8x8_8x8p += counts->tx_size_implied[1][TX_8X8];
+ count16x16_lp += counts->tx_size_implied[3][TX_16X16];
+ count16x16_16x16p += counts->tx_size_implied[2][TX_16X16];
+ count32x32 += counts->tx_size_implied[3][TX_32X32];
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
+#if CONFIG_SUPERTX
+ cm->counts.supertx_size[TX_16X16] == 0 &&
+ cm->counts.supertx_size[TX_32X32] == 0 &&
+#endif // CONFIG_SUPERTX
count32x32 == 0) {
cm->tx_mode = ALLOW_8X8;
reset_skip_tx_size(cm, TX_8X8);
} else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
- count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
+ count8x8_lp == 0 && count16x16_lp == 0 &&
+#if CONFIG_SUPERTX
+ cm->counts.supertx_size[TX_8X8] == 0 &&
+ cm->counts.supertx_size[TX_16X16] == 0 &&
+ cm->counts.supertx_size[TX_32X32] == 0 &&
+#endif // CONFIG_SUPERTX
+ count32x32 == 0) {
cm->tx_mode = ONLY_4X4;
reset_skip_tx_size(cm, TX_4X4);
} else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
cm->tx_mode = ALLOW_32X32;
- } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) {
+ } else if (count32x32 == 0 && count8x8_lp == 0 &&
+#if CONFIG_SUPERTX
+ cm->counts.supertx_size[TX_32X32] == 0 &&
+#endif // CONFIG_SUPERTX
+ count4x4 == 0) {
cm->tx_mode = ALLOW_16X16;
reset_skip_tx_size(cm, TX_16X16);
}
}
+#endif
} else {
- cm->reference_mode = SINGLE_REFERENCE;
encode_frame_internal(cpi);
}
}
+static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi,
+ const MODE_INFO *above_mi, const MODE_INFO *left_mi,
+ const int intraonly) {
+ const PREDICTION_MODE y_mode = mi->mbmi.mode;
+ const PREDICTION_MODE uv_mode = mi->mbmi.uv_mode;
+ const BLOCK_SIZE bsize = mi->mbmi.sb_type;
+
+ if (bsize < BLOCK_8X8) {
+ int idx, idy;
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ for (idy = 0; idy < 2; idy += num_4x4_h)
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int bidx = idy * 2 + idx;
+ const PREDICTION_MODE bmode = mi->bmi[bidx].as_mode;
+ if (intraonly) {
+ const PREDICTION_MODE a = av1_above_block_mode(mi, above_mi, bidx);
+ const PREDICTION_MODE l = av1_left_block_mode(mi, left_mi, bidx);
+ ++counts->kf_y_mode[a][l][bmode];
+ } else {
+ ++counts->y_mode[0][bmode];
+ }
+ }
+ } else {
+ if (intraonly) {
+ const PREDICTION_MODE above = av1_above_block_mode(mi, above_mi, 0);
+ const PREDICTION_MODE left = av1_left_block_mode(mi, left_mi, 0);
+ ++counts->kf_y_mode[above][left][y_mode];
+ } else {
+ ++counts->y_mode[size_group_lookup[bsize]][y_mode];
+ }
+ }
+
+ ++counts->uv_mode[y_mode][uv_mode];
+}
+
+#if CONFIG_VAR_TX
+static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
+ FRAME_COUNTS *counts, TX_SIZE tx_size, int depth,
+ int blk_row, int blk_col) {
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const int tx_row = blk_row >> 1;
+ const int tx_col = blk_col >> 1;
+ const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+ const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
+ int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row,
+ mbmi->sb_type, tx_size);
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (tx_size == plane_tx_size) {
+ ++counts->txfm_partition[ctx][0];
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, tx_size);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bs = tx_size_wide_unit[sub_txs];
+ int i;
+
+ ++counts->txfm_partition[ctx][1];
+ ++x->txb_split_count;
+
+ if (tx_size == TX_8X8) {
+ mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4;
+ mbmi->tx_size = TX_4X4;
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, TX_4X4);
+ return;
+ }
+
+ for (i = 0; i < 4; ++i) {
+ int offsetr = (i >> 1) * bs;
+ int offsetc = (i & 0x01) * bs;
+ update_txfm_count(x, xd, counts, sub_txs, depth + 1, blk_row + offsetr,
+ blk_col + offsetc);
+ }
+ }
+}
+
+static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x,
+ BLOCK_SIZE plane_bsize, int mi_row,
+ int mi_col, FRAME_COUNTS *td_counts) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+ TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ int idx, idy;
+
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ for (idy = 0; idy < mi_height; idy += bh)
+ for (idx = 0; idx < mi_width; idx += bw)
+ update_txfm_count(x, xd, td_counts, max_tx_size, mi_width != mi_height,
+ idy, idx);
+}
+
+static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
+ int blk_col) {
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const int tx_row = blk_row >> 1;
+ const int tx_col = blk_col >> 1;
+ const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+ const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (tx_size == plane_tx_size) {
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, tx_size);
+
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs];
+ int i;
+
+ if (tx_size == TX_8X8) {
+ mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4;
+ mbmi->tx_size = TX_4X4;
+ txfm_partition_update(xd->above_txfm_context + tx_col,
+ xd->left_txfm_context + tx_row, TX_4X4);
+ return;
+ }
+
+ assert(bsl > 0);
+ for (i = 0; i < 4; ++i) {
+ int offsetr = (i >> 1) * bsl;
+ int offsetc = (i & 0x01) * bsl;
+ set_txfm_context(xd, sub_txs, blk_row + offsetr, blk_col + offsetc);
+ }
+ }
+}
+
+static void tx_partition_set_contexts(const AV1_COMMON *const cm,
+ MACROBLOCKD *xd, BLOCK_SIZE plane_bsize,
+ int mi_row, int mi_col) {
+ const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+ TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ int idx, idy;
+
+ xd->above_txfm_context = cm->above_txfm_context + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ for (idy = 0; idy < mi_height; idy += bh)
+ for (idx = 0; idx < mi_width; idx += bw)
+ set_txfm_context(xd, max_tx_size, idy, idx);
+}
+#endif
+
static void encode_superblock(const AV1_COMP *const cpi, ThreadData *td,
- TOKENEXTRA **t, int output_enabled, int mi_row,
+ TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
int mi_col, BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx) {
+ PICK_MODE_CONTEXT *ctx, int *rate) {
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO *mi = xd->mi[0];
+ MODE_INFO **mi_8x8 = xd->mi;
+ MODE_INFO *mi = mi_8x8[0];
MB_MODE_INFO *mbmi = &mi->mbmi;
- MV_REF *const frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
- const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
- const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
- const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
- const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
const int seg_skip =
segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
- int w, h;
+ const int mis = cm->mi_stride;
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
#if CONFIG_PVQ
x->pvq_speed = 0;
- x->pvq_coded = output_enabled ? 1 : 0;
+ x->pvq_coded = !dry_run ? 1 : 0;
#endif
if (!is_inter_block(mbmi)) {
@@ -3210,24 +5267,68 @@
mbmi->skip = 1;
for (plane = 0; plane < MAX_MB_PLANE; ++plane)
av1_encode_intra_block_plane((AV1_COMMON *)cm, x,
- AOMMAX(bsize, BLOCK_8X8), plane);
+ AOMMAX(bsize, BLOCK_8X8), plane, 1);
+ if (!dry_run)
+ sum_intra_stats(td->counts, mi, xd->above_mi, xd->left_mi,
+ frame_is_intra_only(cm));
+
+ // TODO(huisu): move this into sum_intra_stats().
+ if (!dry_run && bsize >= BLOCK_8X8) {
+ FRAME_COUNTS *counts = td->counts;
+ (void)counts;
+#if CONFIG_FILTER_INTRA
+ if (mbmi->mode == DC_PRED
#if CONFIG_PALETTE
- if (bsize >= BLOCK_8X8 && output_enabled) {
+ && mbmi->palette_mode_info.palette_size[0] == 0
+#endif // CONFIG_PALETTE
+ ) {
+ const int use_filter_intra_mode =
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0];
+ ++counts->filter_intra[0][use_filter_intra_mode];
+ }
+ if (mbmi->uv_mode == DC_PRED
+#if CONFIG_PALETTE
+ && mbmi->palette_mode_info.palette_size[1] == 0
+#endif // CONFIG_PALETTE
+ ) {
+ const int use_filter_intra_mode =
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1];
+ ++counts->filter_intra[1][use_filter_intra_mode];
+ }
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_EXT_INTRA
+ if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED) {
+ int p_angle;
+ const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
+ p_angle =
+ mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle))
+ ++counts->intra_filter[intra_filter_ctx][mbmi->intra_filter];
+ }
+#endif // CONFIG_EXT_INTRA
+ }
+
+#if CONFIG_PALETTE
+ if (bsize >= BLOCK_8X8 && !dry_run) {
for (plane = 0; plane <= 1; ++plane) {
if (mbmi->palette_mode_info.palette_size[plane] > 0) {
mbmi->palette_mode_info.palette_first_color_idx[plane] =
xd->plane[plane].color_index_map[0];
// TODO(huisu): this increases the use of token buffer. Needs stretch
// test to verify.
- av1_tokenize_palette_sb(td, bsize, plane, t);
+ av1_tokenize_palette_sb(cpi, td, plane, t, dry_run, bsize, rate);
}
}
}
#endif // CONFIG_PALETTE
- av1_tokenize_sb(cpi, td, t, !output_enabled, AOMMAX(bsize, BLOCK_8X8));
+#if CONFIG_VAR_TX
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+#endif
+ av1_tokenize_sb(cpi, td, t, dry_run, AOMMAX(bsize, BLOCK_8X8), rate);
} else {
int ref;
const int is_compound = has_second_ref(mbmi);
+
set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
for (ref = 0; ref < 1 + is_compound; ++ref) {
YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
@@ -3243,29 +5344,1103 @@
AOMMAX(bsize, BLOCK_8X8));
#if CONFIG_MOTION_VAR
- if (mbmi->motion_mode == OBMC_CAUSAL)
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+ }
#endif // CONFIG_MOTION_VAR
av1_encode_sb((AV1_COMMON *)cm, x, AOMMAX(bsize, BLOCK_8X8));
- av1_tokenize_sb(cpi, td, t, !output_enabled, AOMMAX(bsize, BLOCK_8X8));
+#if CONFIG_VAR_TX
+ if (mbmi->skip) mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (is_rect_tx(mbmi->tx_size))
+ av1_tokenize_sb(cpi, td, t, dry_run, AOMMAX(bsize, BLOCK_8X8), rate);
+ else
+#endif
+ av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col,
+ AOMMAX(bsize, BLOCK_8X8), rate);
+#else
+ av1_tokenize_sb(cpi, td, t, dry_run, AOMMAX(bsize, BLOCK_8X8), rate);
+#endif
}
- if (output_enabled) {
- update_stats(cpi, td, ctx, mi_row, mi_col, bsize);
- for (h = 0; h < y_mis; ++h) {
- MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
- for (w = 0; w < x_mis; ++w) {
- MV_REF *const mv = frame_mv + w;
- mv->ref_frame[0] = mi->mbmi.ref_frame[0];
- mv->ref_frame[1] = mi->mbmi.ref_frame[1];
- mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
- mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
-#if CONFIG_REF_MV
- mv->pred_mv[0].as_int = mi->mbmi.pred_mv[0].as_int;
- mv->pred_mv[1].as_int = mi->mbmi.pred_mv[1].as_int;
+ if (!dry_run) {
+ const int is_inter = is_inter_block(mbmi);
+#if CONFIG_VAR_TX
+ TX_SIZE tx_size =
+ is_inter && !mbmi->skip ? mbmi->min_tx_size : mbmi->tx_size;
+#else
+ TX_SIZE tx_size = mbmi->tx_size;
#endif
+ if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 &&
+ !(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) {
+ const int tx_size_ctx = get_tx_size_context(xd);
+ const int tx_size_cat = is_inter ? inter_tx_size_cat_lookup[bsize]
+ : intra_tx_size_cat_lookup[bsize];
+ const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
+ const int depth = tx_size_to_depth(coded_tx_size);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi)));
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+#if CONFIG_VAR_TX
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (is_rect_tx_allowed(xd, mbmi)) {
+ td->counts->rect_tx[tx_size_cat][is_rect_tx(tx_size)]++;
+ }
+ if (!is_rect_tx_allowed(xd, mbmi) || !is_rect_tx(tx_size)) {
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (is_inter) {
+ tx_partition_count_update(cm, x, bsize, mi_row, mi_col, td->counts);
+ } else {
+ ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth];
+ if (tx_size != max_txsize_lookup[bsize]) ++x->txb_split_count;
+ }
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ }
+#endif
+#endif
+#if !CONFIG_VAR_TX
+ ++td->counts->tx_size[tx_size_cat][tx_size_ctx][depth];
+#endif
+ } else {
+ int i, j;
+ TX_SIZE intra_tx_size;
+ // The new intra coding scheme requires no change of transform size
+ if (is_inter_block(&mi->mbmi)) {
+ if (xd->lossless[mbmi->segment_id]) {
+ intra_tx_size = TX_4X4;
+ } else {
+ intra_tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, 1);
+ }
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ ++td->counts->tx_size_implied[max_txsize_lookup[bsize]]
+ [txsize_sqr_up_map[tx_size]];
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ } else {
+ intra_tx_size = (bsize >= BLOCK_8X8) ? tx_size : TX_4X4;
+ }
+
+ for (j = 0; j < mi_height; j++)
+ for (i = 0; i < mi_width; i++)
+ if (mi_col + i < cm->mi_cols && mi_row + j < cm->mi_rows)
+ mi_8x8[mis * j + i]->mbmi.tx_size = intra_tx_size;
+
+#if CONFIG_VAR_TX
+ mbmi->min_tx_size = get_min_tx_size(intra_tx_size);
+ if (intra_tx_size != max_txsize_lookup[bsize]) ++x->txb_split_count;
+#endif
+ }
+
+ ++td->counts->tx_size_totals[txsize_sqr_map[tx_size]];
+ ++td->counts
+ ->tx_size_totals[txsize_sqr_map[get_uv_tx_size(mbmi, &xd->plane[1])]];
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(tx_size, bsize, is_inter_block(mbmi)) > 1 &&
+ cm->base_qindex > 0 && !mbmi->skip &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ int eset = get_ext_tx_set(tx_size, bsize, is_inter_block(mbmi));
+ if (eset > 0) {
+ if (is_inter_block(mbmi)) {
+ ++td->counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]]
+ [mbmi->tx_type];
+ } else {
+ ++td->counts->intra_ext_tx[eset][tx_size][mbmi->mode][mbmi->tx_type];
+ }
}
}
+#else
+ if (tx_size < TX_32X32 && cm->base_qindex > 0 && !mbmi->skip &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ if (is_inter_block(mbmi)) {
+ ++td->counts->inter_ext_tx[tx_size][mbmi->tx_type];
+ } else {
+ ++td->counts->intra_ext_tx[tx_size]
+ [intra_mode_to_tx_type_context[mbmi->mode]]
+ [mbmi->tx_type];
+ }
+ }
+#endif // CONFIG_EXT_TX
+ }
+
+#if CONFIG_VAR_TX
+ if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 &&
+ is_inter_block(mbmi) && !(mbmi->skip || seg_skip)) {
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (is_rect_tx(mbmi->tx_size)) {
+ set_txfm_ctxs(mbmi->tx_size, xd->n8_w, xd->n8_h, xd);
+ } else {
+ if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
+ }
+#else
+ if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ } else {
+ TX_SIZE tx_size = mbmi->tx_size;
+ // The new intra coding scheme requires no change of transform size
+ if (is_inter_block(mbmi))
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ {
+ tx_size = AOMMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
+ max_txsize_lookup[bsize]);
+ if (txsize_sqr_map[max_txsize_rect_lookup[bsize]] <= tx_size)
+ tx_size = max_txsize_rect_lookup[bsize];
+ if (xd->lossless[mbmi->segment_id]) tx_size = TX_4X4;
+ }
+#else
+ tx_size = AOMMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
+ max_txsize_lookup[bsize]);
+#endif
+ else
+ tx_size = (bsize >= BLOCK_8X8) ? tx_size : TX_4X4;
+ mbmi->tx_size = tx_size;
+ set_txfm_ctxs(tx_size, xd->n8_w, xd->n8_h, xd);
+ }
+#endif
+}
+
+#if CONFIG_SUPERTX
+static int check_intra_b(PICK_MODE_CONTEXT *ctx) {
+ if (!is_inter_mode((&ctx->mic)->mbmi.mode)) return 1;
+#if CONFIG_EXT_INTER
+ if (ctx->mic.mbmi.ref_frame[1] == INTRA_FRAME) return 1;
+#endif // CONFIG_EXT_INTER
+ return 0;
+}
+
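+// Returns 1 if any block covered by pc_tree is coded with an intra mode
+// (out-of-frame positions also return 1), 0 otherwise.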
+static int check_intra_sb(const AV1_COMP *const cpi, const TileInfo *const tile,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ PC_TREE *pc_tree) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+ int i;
+#endif
+
+ assert(bsize >= BLOCK_8X8);
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return 1;
+
+ switch (partition) {
+ case PARTITION_NONE: return check_intra_b(&pc_tree->none); break;
+ case PARTITION_VERT:
+ if (check_intra_b(&pc_tree->vertical[0])) return 1;
+ if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+ if (check_intra_b(&pc_tree->vertical[1])) return 1;
+ }
+ break;
+ case PARTITION_HORZ:
+ if (check_intra_b(&pc_tree->horizontal[0])) return 1;
+ if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+ if (check_intra_b(&pc_tree->horizontal[1])) return 1;
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8) {
+ if (check_intra_b(pc_tree->leaf_split[0])) return 1;
+ } else {
+ if (check_intra_sb(cpi, tile, mi_row, mi_col, subsize,
+ pc_tree->split[0]))
+ return 1;
+ if (check_intra_sb(cpi, tile, mi_row, mi_col + hbs, subsize,
+ pc_tree->split[1]))
+ return 1;
+ if (check_intra_sb(cpi, tile, mi_row + hbs, mi_col, subsize,
+ pc_tree->split[2]))
+ return 1;
+ if (check_intra_sb(cpi, tile, mi_row + hbs, mi_col + hbs, subsize,
+ pc_tree->split[3]))
+ return 1;
+ }
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ for (i = 0; i < 3; i++) {
+ if (check_intra_b(&pc_tree->horizontala[i])) return 1;
+ }
+ break;
+ case PARTITION_HORZ_B:
+ for (i = 0; i < 3; i++) {
+ if (check_intra_b(&pc_tree->horizontalb[i])) return 1;
+ }
+ break;
+ case PARTITION_VERT_A:
+ for (i = 0; i < 3; i++) {
+ if (check_intra_b(&pc_tree->verticala[i])) return 1;
+ }
+ break;
+ case PARTITION_VERT_B:
+ for (i = 0; i < 3; i++) {
+ if (check_intra_b(&pc_tree->verticalb[i])) return 1;
+ }
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0);
+ }
+ return 0;
+}
+
+static int check_supertx_b(TX_SIZE supertx_size, PICK_MODE_CONTEXT *ctx) {
+ return ctx->mic.mbmi.tx_size == supertx_size;
+}
+
+static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
+ PC_TREE *pc_tree) {
+ PARTITION_TYPE partition;
+ BLOCK_SIZE subsize;
+
+ partition = pc_tree->partitioning;
+ subsize = get_subsize(bsize, partition);
+ switch (partition) {
+ case PARTITION_NONE: return check_supertx_b(supertx_size, &pc_tree->none);
+ case PARTITION_VERT:
+ return check_supertx_b(supertx_size, &pc_tree->vertical[0]);
+ case PARTITION_HORZ:
+ return check_supertx_b(supertx_size, &pc_tree->horizontal[0]);
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8)
+ return check_supertx_b(supertx_size, pc_tree->leaf_split[0]);
+ else
+ return check_supertx_sb(subsize, supertx_size, pc_tree->split[0]);
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ return check_supertx_b(supertx_size, &pc_tree->horizontala[0]);
+ case PARTITION_HORZ_B:
+ return check_supertx_b(supertx_size, &pc_tree->horizontalb[0]);
+ case PARTITION_VERT_A:
+ return check_supertx_b(supertx_size, &pc_tree->verticala[0]);
+ case PARTITION_VERT_B:
+ return check_supertx_b(supertx_size, &pc_tree->verticalb[0]);
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0); return 0;
}
}
+
+static void predict_superblock(const AV1_COMP *const cpi, ThreadData *td,
+#if CONFIG_EXT_INTER
+ int mi_row_ori, int mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ int mi_row_pred, int mi_col_pred,
+ BLOCK_SIZE bsize_pred, int b_sub8x8, int block) {
+ // Used in supertx
+ // (mi_row_ori, mi_col_ori): location for mv
+ // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *mi_8x8 = xd->mi[0];
+ MODE_INFO *mi = mi_8x8;
+ MB_MODE_INFO *mbmi = &mi->mbmi;
+ int ref;
+ const int is_compound = has_second_ref(mbmi);
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
+ av1_setup_pre_planes(xd, ref, cfg, mi_row_pred, mi_col_pred,
+ &xd->block_refs[ref]->sf);
+ }
+
+ if (!b_sub8x8)
+ av1_build_inter_predictors_sb_extend(xd,
+#if CONFIG_EXT_INTER
+ mi_row_ori, mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ mi_row_pred, mi_col_pred, bsize_pred);
+ else
+ av1_build_inter_predictors_sb_sub8x8_extend(xd,
+#if CONFIG_EXT_INTER
+ mi_row_ori, mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ mi_row_pred, mi_col_pred,
+ bsize_pred, block);
+}
+
+static void predict_b_extend(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int block,
+ int mi_row_ori, int mi_col_ori, int mi_row_pred,
+ int mi_col_pred, int mi_row_top, int mi_col_top,
+ uint8_t *dst_buf[3], int dst_stride[3],
+ BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred,
+ RUN_TYPE dry_run, int b_sub8x8, int bextend) {
+ // Used in supertx
+ // (mi_row_ori, mi_col_ori): location for mv
+ // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+ // (mi_row_top, mi_col_top, bsize_top): region of the top partition size
+  // block: index of the sub-block when the ori block is sub8x8
+  // b_sub8x8: 1: ori block is sub8x8; 0: ori block is not sub8x8
+  // bextend: 1: the region to predict is an extension of ori; 0: it is not
+
+ MACROBLOCK *const x = &td->mb;
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int r = (mi_row_pred - mi_row_top) * MI_SIZE;
+ int c = (mi_col_pred - mi_col_top) * MI_SIZE;
+ const int mi_width_top = num_8x8_blocks_wide_lookup[bsize_top];
+ const int mi_height_top = num_8x8_blocks_high_lookup[bsize_top];
+
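+  // Nothing to predict for regions outside the top block or the frame.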
+ if (mi_row_pred < mi_row_top || mi_col_pred < mi_col_top ||
+ mi_row_pred >= mi_row_top + mi_height_top ||
+ mi_col_pred >= mi_col_top + mi_width_top || mi_row_pred >= cm->mi_rows ||
+ mi_col_pred >= cm->mi_cols)
+ return;
+
+ set_offsets_extend(cpi, td, tile, mi_row_pred, mi_col_pred, mi_row_ori,
+ mi_col_ori, bsize_pred);
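+  // Point each plane's dst buffer at the (r, c) offset within the caller's
+  // buffers, honoring per-plane chroma subsampling.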
+ xd->plane[0].dst.stride = dst_stride[0];
+ xd->plane[1].dst.stride = dst_stride[1];
+ xd->plane[2].dst.stride = dst_stride[2];
+ xd->plane[0].dst.buf = dst_buf[0] +
+ (r >> xd->plane[0].subsampling_y) * dst_stride[0] +
+ (c >> xd->plane[0].subsampling_x);
+ xd->plane[1].dst.buf = dst_buf[1] +
+ (r >> xd->plane[1].subsampling_y) * dst_stride[1] +
+ (c >> xd->plane[1].subsampling_x);
+ xd->plane[2].dst.buf = dst_buf[2] +
+ (r >> xd->plane[2].subsampling_y) * dst_stride[2] +
+ (c >> xd->plane[2].subsampling_x);
+
+ predict_superblock(cpi, td,
+#if CONFIG_EXT_INTER
+ mi_row_ori, mi_col_ori,
+#endif // CONFIG_EXT_INTER
+ mi_row_pred, mi_col_pred, bsize_pred, b_sub8x8, block);
+
+ if (!dry_run && !bextend) {
+#if CONFIG_SUPERTX
+ update_stats(&cpi->common, td, mi_row_pred, mi_col_pred, 1);
+#else
+ update_stats(&cpi->common, td, mi_row_pred, mi_col_pred);
+#endif
+ }
+}
+
+static void extend_dir(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int block, BLOCK_SIZE bsize,
+ BLOCK_SIZE top_bsize, int mi_row, int mi_col,
+ int mi_row_top, int mi_col_top, RUN_TYPE dry_run,
+ uint8_t *dst_buf[3], int dst_stride[3], int dir) {
+ // dir: 0-lower, 1-upper, 2-left, 3-right
+ // 4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright
+ MACROBLOCKD *xd = &td->mb.e_mbd;
+ const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ int xss = xd->plane[1].subsampling_x;
+ int yss = xd->plane[1].subsampling_y;
+ int b_sub8x8 = (bsize < BLOCK_8X8) ? 1 : 0;
+
+ BLOCK_SIZE extend_bsize;
+ int unit, mi_row_pred, mi_col_pred;
+
+ if (dir == 0 || dir == 1) { // lower and upper
+ extend_bsize = (mi_width == 1 || bsize < BLOCK_8X8 || xss < yss)
+ ? BLOCK_8X8
+ : BLOCK_16X8;
+ unit = num_8x8_blocks_wide_lookup[extend_bsize];
+ mi_row_pred = mi_row + ((dir == 0) ? mi_height : -1);
+ mi_col_pred = mi_col;
+
+ predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
+ mi_col_pred, mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, extend_bsize, dry_run, b_sub8x8, 1);
+
+ if (mi_width > unit) {
+ int i;
+ for (i = 0; i < mi_width / unit - 1; i++) {
+ mi_col_pred += unit;
+ predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
+ mi_col_pred, mi_row_top, mi_col_top, dst_buf,
+ dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8,
+ 1);
+ }
+ }
+ } else if (dir == 2 || dir == 3) { // left and right
+ extend_bsize = (mi_height == 1 || bsize < BLOCK_8X8 || yss < xss)
+ ? BLOCK_8X8
+ : BLOCK_8X16;
+ unit = num_8x8_blocks_high_lookup[extend_bsize];
+ mi_row_pred = mi_row;
+ mi_col_pred = mi_col + ((dir == 3) ? mi_width : -1);
+
+ predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
+ mi_col_pred, mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, extend_bsize, dry_run, b_sub8x8, 1);
+
+ if (mi_height > unit) {
+ int i;
+ for (i = 0; i < mi_height / unit - 1; i++) {
+ mi_row_pred += unit;
+ predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
+ mi_col_pred, mi_row_top, mi_col_top, dst_buf,
+ dst_stride, top_bsize, extend_bsize, dry_run, b_sub8x8,
+ 1);
+ }
+ }
+ } else {
+ extend_bsize = BLOCK_8X8;
+ mi_row_pred = mi_row + ((dir == 4 || dir == 6) ? mi_height : -1);
+ mi_col_pred = mi_col + ((dir == 6 || dir == 7) ? mi_width : -1);
+
+ predict_b_extend(cpi, td, tile, block, mi_row, mi_col, mi_row_pred,
+ mi_col_pred, mi_row_top, mi_col_top, dst_buf, dst_stride,
+ top_bsize, extend_bsize, dry_run, b_sub8x8, 1);
+ }
+}
+
+static void extend_all(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int block, BLOCK_SIZE bsize,
+ BLOCK_SIZE top_bsize, int mi_row, int mi_col,
+ int mi_row_top, int mi_col_top, RUN_TYPE dry_run,
+ uint8_t *dst_buf[3], int dst_stride[3]) {
+ assert(block >= 0 && block < 4);
+ extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride, 0);
+ extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride, 1);
+ extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride, 2);
+ extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride, 3);
+ extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride, 4);
+ extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride, 5);
+ extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride, 6);
+ extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride, 7);
+}
+
+// This function generates the prediction for multiple blocks, reducing the
+// discontinuity around their boundaries with smoothing masks. The basic
+// smoothing mask is a soft step function along the horz/vert direction. In the
+// more complicated case where a block is split into 4 subblocks, the basic
+// mask is first applied to the neighboring subblocks (2 pairs) in the
+// horizontal direction and then to the 2 masked predictions in the vertical
+// direction. If the block is split over more than one level, the masked
+// prediction at every stage is stored in the dst_buf[] passed from the higher
+// level. A simplified standalone sketch of the basic blend follows this
+// function.
+static void predict_sb_complex(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row,
+ int mi_col, int mi_row_top, int mi_col_top,
+ RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ BLOCK_SIZE top_bsize, uint8_t *dst_buf[3],
+ int dst_stride[3], PC_TREE *pc_tree) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+
+ const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+ const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+ const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+
+ int i;
+ uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf3[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+ int dst_stride1[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
+ int dst_stride2[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
+ int dst_stride3[3] = { MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE };
+
+ assert(bsize >= BLOCK_8X8);
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
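+  // Carve each temporary buffer into three per-plane (Y, U, V) regions.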
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int len = sizeof(uint16_t);
+ dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+ dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_TX_SQUARE * len);
+ dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_TX_SQUARE * len);
+ dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+ dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_TX_SQUARE * len);
+ dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_TX_SQUARE * len);
+ dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3);
+ dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAX_TX_SQUARE * len);
+ dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAX_TX_SQUARE * len);
+ } else {
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ dst_buf1[0] = tmp_buf1;
+ dst_buf1[1] = tmp_buf1 + MAX_TX_SQUARE;
+ dst_buf1[2] = tmp_buf1 + 2 * MAX_TX_SQUARE;
+ dst_buf2[0] = tmp_buf2;
+ dst_buf2[1] = tmp_buf2 + MAX_TX_SQUARE;
+ dst_buf2[2] = tmp_buf2 + 2 * MAX_TX_SQUARE;
+ dst_buf3[0] = tmp_buf3;
+ dst_buf3[1] = tmp_buf3 + MAX_TX_SQUARE;
+ dst_buf3[2] = tmp_buf3 + 2 * MAX_TX_SQUARE;
+#if CONFIG_AOM_HIGHBITDEPTH
+ }
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+ if (!dry_run && bsize < top_bsize) {
+ // Explicitly cast away const.
+ FRAME_COUNTS *const frame_counts = (FRAME_COUNTS *)&cm->counts;
+ frame_counts->partition[ctx][partition]++;
+ }
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ }
+
+ switch (partition) {
+ case PARTITION_NONE:
+ assert(bsize < top_bsize);
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ bsize, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, dst_buf, dst_stride);
+ break;
+ case PARTITION_HORZ:
+ if (bsize == BLOCK_8X8) {
+        // First half
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ BLOCK_8X8, dry_run, 1, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+
+ // Second half
+ predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ // Smooth
+ xd->plane[0].dst.buf = dst_buf[0];
+ xd->plane[0].dst.stride = dst_stride[0];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ 0);
+ } else {
+ // First half
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0);
+
+ if (mi_row + hbs < cm->mi_rows) {
+ // Second half
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1,
+ dst_stride1);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dry_run, dst_buf1,
+ dst_stride1, 1);
+
+ // Smooth
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ }
+ }
+ break;
+ case PARTITION_VERT:
+ if (bsize == BLOCK_8X8) {
+ // First half
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ BLOCK_8X8, dry_run, 1, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+
+ // Second half
+ predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ // Smooth
+ xd->plane[0].dst.buf = dst_buf[0];
+ xd->plane[0].dst.stride = dst_stride[0];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[0], dst_stride[0], dst_buf1[0], dst_stride1[0], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ 0);
+ } else {
+        // First half
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3);
+
+ if (mi_col + hbs < cm->mi_cols) {
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1,
+ dst_stride1);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf1,
+ dst_stride1, 2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ }
+ }
+ }
+ break;
+ case PARTITION_SPLIT:
+ if (bsize == BLOCK_8X8) {
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ BLOCK_8X8, dry_run, 1, 0);
+ predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
+ predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
+ predict_b_extend(cpi, td, tile, 3, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf3, dst_stride3,
+ top_bsize, BLOCK_8X8, dry_run, 1, 1);
+
+ if (bsize < top_bsize) {
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+ extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+ extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
+ extend_all(cpi, td, tile, 3, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf3, dst_stride3);
+ }
+ } else {
+ predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row_top,
+ mi_col_top, dry_run, subsize, top_bsize, dst_buf,
+ dst_stride, pc_tree->split[0]);
+ if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+ predict_sb_complex(cpi, td, tile, mi_row, mi_col + hbs, mi_row_top,
+ mi_col_top, dry_run, subsize, top_bsize, dst_buf1,
+ dst_stride1, pc_tree->split[1]);
+ if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
+ predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col, mi_row_top,
+ mi_col_top, dry_run, subsize, top_bsize, dst_buf2,
+ dst_stride2, pc_tree->split[2]);
+ if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+ predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col + hbs,
+ mi_row_top, mi_col_top, dry_run, subsize,
+ top_bsize, dst_buf3, dst_stride3,
+ pc_tree->split[3]);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ if (bsize == BLOCK_8X8 && i != 0)
+ continue; // Skip <4x4 chroma smoothing
+ if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ if (mi_row + hbs < cm->mi_rows) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf2[i], dst_stride2[i], dst_buf3[i], dst_stride3[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ }
+ break;
+#if CONFIG_EXT_PARTITION_TYPES
+ case PARTITION_HORZ_A:
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+ top_bsize, subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 1);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ i);
+ }
+
+ break;
+ case PARTITION_VERT_A:
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
+ dst_stride2, top_bsize, subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, dry_run, dst_buf2, dst_stride2, 2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf2[i], dst_stride2[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ i);
+ }
+ break;
+ case PARTITION_HORZ_B:
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 0);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+ mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+ top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
+ mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+ dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
+ mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2,
+ dst_stride2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf1[i];
+ xd->plane[i].dst.stride = dst_stride1[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_VERT, i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_HORZ,
+ i);
+ }
+ break;
+ case PARTITION_VERT_B:
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+ mi_row_top, mi_col_top, dst_buf, dst_stride, top_bsize,
+ subsize, dry_run, 0, 0);
+ if (bsize < top_bsize)
+ extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride);
+ else
+ extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+ mi_row_top, mi_col_top, dry_run, dst_buf, dst_stride, 3);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+ mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+ dst_stride1, top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+ mi_row_top, mi_col_top, dry_run, dst_buf1, dst_stride1);
+
+ predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
+ mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+ dst_buf2, dst_stride2, top_bsize, bsize2, dry_run, 0, 0);
+ extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
+ mi_col + hbs, mi_row_top, mi_col_top, dry_run, dst_buf2,
+ dst_stride2);
+
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf1[i];
+ xd->plane[i].dst.stride = dst_stride1[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf1[i], dst_stride1[i], dst_buf2[i], dst_stride2[i],
+ mi_row, mi_col, mi_row_top, mi_col_top, bsize, top_bsize,
+ PARTITION_HORZ, i);
+ }
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = dst_buf[i];
+ xd->plane[i].dst.stride = dst_stride[i];
+ av1_build_masked_inter_predictor_complex(
+ xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i], mi_row,
+ mi_col, mi_row_top, mi_col_top, bsize, top_bsize, PARTITION_VERT,
+ i);
+ }
+ break;
+#endif // CONFIG_EXT_PARTITION_TYPES
+ default: assert(0);
+ }
+
+#if CONFIG_EXT_PARTITION_TYPES
+ if (bsize < top_bsize)
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+#else
+ if (bsize < top_bsize && (partition != PARTITION_SPLIT || bsize == BLOCK_8X8))
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif // CONFIG_EXT_PARTITION_TYPES
+}
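+
+// A minimal, self-contained sketch of the basic soft-step blend described in
+// the comment above predict_sb_complex(). The 64-weight mask and the helper
+// name are illustrative assumptions only; the actual blending is performed by
+// av1_build_masked_inter_predictor_complex().
+static void soft_step_blend_row_sketch(uint8_t *dst, const uint8_t *pred0,
+                                       const uint8_t *pred1,
+                                       const uint8_t *mask, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    // mask[i] in [0, 64]: 64 keeps pred0, 0 keeps pred1, and intermediate
+    // values fade between the two predictors around the partition boundary.
+    dst[i] = (uint8_t)((mask[i] * pred0[i] + (64 - mask[i]) * pred1[i] + 32) >>
+                       6);
+  }
+}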
+
+static void rd_supertx_sb(const AV1_COMP *const cpi, ThreadData *td,
+ const TileInfo *const tile, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *tmp_rate, int64_t *tmp_dist,
+ TX_TYPE *best_tx, PC_TREE *pc_tree) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int plane, pnskip, skippable, skippable_uv, rate_uv, this_rate,
+ base_rate = *tmp_rate;
+ int64_t sse, pnsse, sse_uv, this_dist, dist_uv;
+ uint8_t *dst_buf[3];
+ int dst_stride[3];
+ TX_SIZE tx_size;
+ MB_MODE_INFO *mbmi;
+ TX_TYPE tx_type, best_tx_nostx;
+#if CONFIG_EXT_TX
+ int ext_tx_set;
+#endif // CONFIG_EXT_TX
+ int tmp_rate_tx = 0, skip_tx = 0;
+ int64_t tmp_dist_tx = 0, rd_tx, bestrd_tx = INT64_MAX;
+
+ set_skip_context(xd, mi_row, mi_col);
+ set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+ update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize, 1, pc_tree);
+ av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
+ for (plane = 0; plane < MAX_MB_PLANE; plane++) {
+ dst_buf[plane] = xd->plane[plane].dst.buf;
+ dst_stride[plane] = xd->plane[plane].dst.stride;
+ }
+ predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col, 1, bsize,
+ bsize, dst_buf, dst_stride, pc_tree);
+
+ set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+ set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize);
+
+ mbmi = &xd->mi[0]->mbmi;
+ best_tx_nostx = mbmi->tx_type;
+
+ *best_tx = DCT_DCT;
+
+ // chroma
+ skippable_uv = 1;
+ rate_uv = 0;
+ dist_uv = 0;
+ sse_uv = 0;
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_VAR_TX
+ ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ int coeff_ctx = 1;
+ RD_STATS this_rd_stats;
+ av1_init_rd_stats(&this_rd_stats);
+
+ tx_size = max_txsize_lookup[bsize];
+ tx_size =
+ uv_txsize_lookup[bsize][tx_size][cm->subsampling_x][cm->subsampling_y];
+ av1_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl);
+ coeff_ctx = combine_entropy_contexts(ctxa[0], ctxl[0]);
+
+ av1_subtract_plane(x, bsize, plane);
+ av1_tx_block_rd_b(cpi, x, tx_size, 0, 0, plane, 0,
+ get_plane_block_size(bsize, pd), coeff_ctx,
+ &this_rd_stats);
+
+ this_rate = this_rd_stats.rate;
+ this_dist = this_rd_stats.dist;
+ pnsse = this_rd_stats.sse;
+ pnskip = this_rd_stats.skip;
+#else
+ tx_size = max_txsize_lookup[bsize];
+ tx_size =
+ uv_txsize_lookup[bsize][tx_size][cm->subsampling_x][cm->subsampling_y];
+ av1_subtract_plane(x, bsize, plane);
+ av1_txfm_rd_in_plane_supertx(x, cpi, &this_rate, &this_dist, &pnskip,
+ &pnsse, INT64_MAX, plane, bsize, tx_size, 0);
+#endif // CONFIG_VAR_TX
+
+ rate_uv += this_rate;
+ dist_uv += this_dist;
+ sse_uv += pnsse;
+ skippable_uv &= pnskip;
+ }
+
+ // luma
+ tx_size = max_txsize_lookup[bsize];
+ av1_subtract_plane(x, bsize, 0);
+#if CONFIG_EXT_TX
+ ext_tx_set = get_ext_tx_set(tx_size, bsize, 1);
+#endif // CONFIG_EXT_TX
+ for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+#if CONFIG_VAR_TX
+ ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ int coeff_ctx = 1;
+ RD_STATS this_rd_stats;
+#endif // CONFIG_VAR_TX
+
+#if CONFIG_EXT_TX
+ if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
+#else
+ if (tx_size >= TX_32X32 && tx_type != DCT_DCT) continue;
+#endif // CONFIG_EXT_TX
+ mbmi->tx_type = tx_type;
+
+#if CONFIG_VAR_TX
+ av1_init_rd_stats(&this_rd_stats);
+
+ av1_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl);
+ coeff_ctx = combine_entropy_contexts(ctxa[0], ctxl[0]);
+ av1_tx_block_rd_b(cpi, x, tx_size, 0, 0, 0, 0, bsize, coeff_ctx,
+ &this_rd_stats);
+
+ this_rate = this_rd_stats.rate;
+ this_dist = this_rd_stats.dist;
+ pnsse = this_rd_stats.sse;
+ pnskip = this_rd_stats.skip;
+#else
+ av1_txfm_rd_in_plane_supertx(x, cpi, &this_rate, &this_dist, &pnskip,
+ &pnsse, INT64_MAX, 0, bsize, tx_size, 0);
+#endif // CONFIG_VAR_TX
+
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(tx_size, bsize, 1) > 1 &&
+ !xd->lossless[xd->mi[0]->mbmi.segment_id] && this_rate != INT_MAX) {
+ if (ext_tx_set > 0)
+ this_rate +=
+ cpi->inter_tx_type_costs[ext_tx_set][mbmi->tx_size][mbmi->tx_type];
+ }
+#else
+ if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+ this_rate != INT_MAX) {
+ this_rate += cpi->inter_tx_type_costs[tx_size][mbmi->tx_type];
+ }
+#endif // CONFIG_EXT_TX
+ *tmp_rate = rate_uv + this_rate;
+ *tmp_dist = dist_uv + this_dist;
+ sse = sse_uv + pnsse;
+ skippable = skippable_uv && pnskip;
+ if (skippable) {
+ *tmp_rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ x->skip = 1;
+ } else {
+ if (RDCOST(x->rdmult, x->rddiv, *tmp_rate, *tmp_dist) <
+ RDCOST(x->rdmult, x->rddiv, 0, sse)) {
+ *tmp_rate += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ x->skip = 0;
+ } else {
+ *tmp_dist = sse;
+ *tmp_rate = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ x->skip = 1;
+ }
+ }
+ *tmp_rate += base_rate;
+ rd_tx = RDCOST(x->rdmult, x->rddiv, *tmp_rate, *tmp_dist);
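+    // DCT_DCT (tried first) always seeds the best choice; any other tx_type
+    // must improve the RD cost by roughly 1% before it replaces the best.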
+ if (rd_tx < bestrd_tx * 0.99 || tx_type == DCT_DCT) {
+ *best_tx = tx_type;
+ bestrd_tx = rd_tx;
+ tmp_rate_tx = *tmp_rate;
+ tmp_dist_tx = *tmp_dist;
+ skip_tx = x->skip;
+ }
+ }
+ *tmp_rate = tmp_rate_tx;
+ *tmp_dist = tmp_dist_tx;
+ x->skip = skip_tx;
+#if CONFIG_VAR_TX
+ for (plane = 0; plane < 1; ++plane)
+ memset(x->blk_skip[plane], x->skip,
+ sizeof(uint8_t) * pc_tree->none.num_4x4_blk);
+#endif // CONFIG_VAR_TX
+ xd->mi[0]->mbmi.tx_type = best_tx_nostx;
+}
+#endif // CONFIG_SUPERTX
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index bc75735..dfde235 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -24,6 +24,7 @@
#include "av1/encoder/encodemb.h"
#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/quantize.h"
#include "av1/encoder/rd.h"
#include "av1/encoder/tokenize.h"
@@ -33,11 +34,6 @@
#include "av1/encoder/pvq_encoder.h"
#endif
-struct optimize_ctx {
- ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
- ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
-};
-
void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
@@ -57,89 +53,89 @@
pd->dst.buf, pd->dst.stride);
}
-#define RDTRUNC(RM, DM, R, D) \
- (((1 << (AV1_PROB_COST_SHIFT - 1)) + (R) * (RM)) & \
- ((1 << AV1_PROB_COST_SHIFT) - 1))
-
typedef struct av1_token_state {
int rate;
- int error;
+ int64_t error;
int next;
int16_t token;
- short qc;
+ tran_low_t qc;
+ tran_low_t dqc;
} av1_token_state;
-#if !CONFIG_PVQ
-// TODO(jimbankoski): experiment to find optimal RD numbers.
-static const int plane_rd_mult[PLANE_TYPES] = { 4, 2 };
+// These numbers are empirically obtained.
+static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
+ { 10, 6 }, { 8, 5 },
+};
-#define UPDATE_RD_COST() \
- { \
- rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); \
- rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \
- if (rd_cost0 == rd_cost1) { \
- rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0); \
- rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1); \
- } \
+#define UPDATE_RD_COST() \
+ { \
+ rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); \
+ rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \
}
-// This function is a place holder for now but may ultimately need
-// to scan previous tokens to work out the correct context.
-static int trellis_get_coeff_context(const int16_t *scan, const int16_t *nb,
- int idx, int token, uint8_t *token_cache) {
- int bak = token_cache[scan[idx]], pt;
- token_cache[scan[idx]] = av1_pt_energy_class[token];
- pt = get_coef_context(nb, token_cache, idx + 1);
- token_cache[scan[idx]] = bak;
- return pt;
-}
-
-static int optimize_b(const AV1_COMMON *const cm, MACROBLOCK *mb, int plane,
- int block, TX_SIZE tx_size, int ctx) {
+int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block,
+ TX_SIZE tx_size, int ctx) {
MACROBLOCKD *const xd = &mb->e_mbd;
struct macroblock_plane *const p = &mb->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
const int ref = is_inter_block(&xd->mi[0]->mbmi);
- av1_token_state tokens[1025][2];
- unsigned best_index[1025][2];
- uint8_t token_cache[1024];
+ av1_token_state tokens[MAX_TX_SQUARE + 1][2];
+ unsigned best_index[MAX_TX_SQUARE + 1][2];
+ uint8_t token_cache[MAX_TX_SQUARE];
const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const int eob = p->eobs[block];
- const PLANE_TYPE type = pd->plane_type;
- const int default_eob = 1 << (tx_size_1d_log2[tx_size] * 2);
- const int mul = 1 + (tx_size == TX_32X32);
-#if CONFIG_AOM_QM
- int seg_id = xd->mi[0]->mbmi.segment_id;
- int is_intra = !is_inter_block(&xd->mi[0]->mbmi);
- const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][is_intra][tx_size];
-#endif
- const int16_t *dequant_ptr = pd->dequant;
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const int default_eob = tx_size_2d[tx_size];
+ const int16_t *const dequant_ptr = pd->dequant;
const uint8_t *const band_translate = get_band_translate(tx_size);
- TX_TYPE tx_type = get_tx_type(type, xd, block);
- const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
const int16_t *const scan = scan_order->scan;
const int16_t *const nb = scan_order->neighbors;
+#if CONFIG_AOM_QM
+ int seg_id = xd->mi[0]->mbmi.segment_id;
+ const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!ref][tx_size];
+#endif
+ const int shift = get_tx_scale(xd, tx_type, tx_size);
+#if CONFIG_NEW_QUANT
+ int dq = get_dq_profile_from_ctx(mb->qindex, ctx, ref, plane_type);
+ const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq];
+#else
+ const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift };
+#endif // CONFIG_NEW_QUANT
int next = eob, sz = 0;
- int64_t rdmult = mb->rdmult * plane_rd_mult[type], rddiv = mb->rddiv;
+ const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1;
+ const int64_t rddiv = mb->rddiv;
int64_t rd_cost0, rd_cost1;
- int rate0, rate1, error0, error1;
+ int rate0, rate1;
+ int64_t error0, error1;
int16_t t0, t1;
- EXTRABIT e0;
- int best, band, pt, i, final_eob;
+ int best, band = (eob < default_eob) ? band_translate[eob]
+ : band_translate[eob - 1];
+ int pt, i, final_eob;
#if CONFIG_AOM_HIGHBITDEPTH
const int *cat6_high_cost = av1_get_high_cost_table(xd->bd);
#else
const int *cat6_high_cost = av1_get_high_cost_table(8);
#endif
+ unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+ mb->token_costs[txsize_sqr_map[tx_size]][plane_type][ref];
+ const uint16_t *band_counts = &band_count_table[tx_size][band];
+ uint16_t band_left = eob - band_cum_count_table[tx_size][band] + 1;
+ int shortcut = 0;
+ int next_shortcut = 0;
- assert((!type && !plane) || (type && plane));
+ assert((mb->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
+
+ token_costs += band;
+
+ assert((!plane_type && !plane) || (plane_type && plane));
assert(eob <= default_eob);
/* Now set up a Viterbi trellis to evaluate alternative roundings. */
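+  /* Each coefficient position keeps two candidate states: tokens[i][0]
+   * carries the quantized level as produced by the quantizer and
+   * tokens[i][1] the level lowered by one (possibly to zero). For each
+   * candidate the cheaper continuation under RDCOST() is remembered in
+   * best_index[], and the winning path is read back after the scan. */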
- if (!ref) rdmult = (rdmult * 9) >> 4;
-
/* Initialize the sentinel node of the trellis. */
tokens[eob][0].rate = 0;
tokens[eob][0].error = 0;
@@ -148,74 +144,110 @@
tokens[eob][0].qc = 0;
tokens[eob][1] = tokens[eob][0];
- for (i = 0; i < eob; i++)
- token_cache[scan[i]] = av1_pt_energy_class[av1_get_token(qcoeff[scan[i]])];
+ for (i = 0; i < eob; i++) {
+ const int rc = scan[i];
+ tokens[i][0].rate = av1_get_token_cost(qcoeff[rc], &t0, cat6_high_cost);
+ tokens[i][0].token = t0;
+ token_cache[rc] = av1_pt_energy_class[t0];
+ }
for (i = eob; i-- > 0;) {
- int base_bits, d2, dx;
-
+ int base_bits, dx;
+ int64_t d2;
const int rc = scan[i];
#if CONFIG_AOM_QM
int iwt = iqmatrix[rc];
#endif
int x = qcoeff[rc];
+ next_shortcut = shortcut;
+
/* Only add a trellis state for non-zero coefficients. */
- if (x) {
- int shortcut = 0;
+ if (UNLIKELY(x)) {
error0 = tokens[next][0].error;
error1 = tokens[next][1].error;
/* Evaluate the first possibility for this state. */
rate0 = tokens[next][0].rate;
rate1 = tokens[next][1].rate;
- av1_get_token_extra(x, &t0, &e0);
- /* Consider both possible successor states. */
- if (next < default_eob) {
- band = band_translate[i + 1];
- pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
- rate0 += mb->token_costs[tx_size][type][ref][band][0][pt]
- [tokens[next][0].token];
- rate1 += mb->token_costs[tx_size][type][ref][band][0][pt]
- [tokens[next][1].token];
+
+ if (next_shortcut) {
+ /* Consider both possible successor states. */
+ if (next < default_eob) {
+ pt = get_coef_context(nb, token_cache, i + 1);
+ rate0 += (*token_costs)[0][pt][tokens[next][0].token];
+ rate1 += (*token_costs)[0][pt][tokens[next][1].token];
+ }
+ UPDATE_RD_COST();
+ /* And pick the best. */
+ best = rd_cost1 < rd_cost0;
+ } else {
+ if (next < default_eob) {
+ pt = get_coef_context(nb, token_cache, i + 1);
+ rate0 += (*token_costs)[0][pt][tokens[next][0].token];
+ }
+ best = 0;
}
- UPDATE_RD_COST();
- /* And pick the best. */
- best = rd_cost1 < rd_cost0;
- base_bits = av1_get_cost(t0, e0, cat6_high_cost);
- dx = mul * (dqcoeff[rc] - coeff[rc]);
+
+ dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
#if CONFIG_AOM_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
dx >>= xd->bd - 8;
}
#endif // CONFIG_AOM_HIGHBITDEPTH
- d2 = dx * dx;
- tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
+ d2 = (int64_t)dx * dx;
+ tokens[i][0].rate += (best ? rate1 : rate0);
tokens[i][0].error = d2 + (best ? error1 : error0);
tokens[i][0].next = next;
- tokens[i][0].token = t0;
tokens[i][0].qc = x;
+ tokens[i][0].dqc = dqcoeff[rc];
best_index[i][0] = best;
/* Evaluate the second possibility for this state. */
rate0 = tokens[next][0].rate;
rate1 = tokens[next][1].rate;
-#if CONFIG_AOM_QM
- if ((abs(x) * dequant_ptr[rc != 0] * iwt >
- ((abs(coeff[rc]) * mul) << AOM_QM_BITS)) &&
- (abs(x) * dequant_ptr[rc != 0] * iwt <
- ((abs(coeff[rc]) * mul + dequant_ptr[rc != 0]) << AOM_QM_BITS)))
-#else
- if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) &&
- (abs(x) * dequant_ptr[rc != 0] <
- abs(coeff[rc]) * mul + dequant_ptr[rc != 0]))
-#endif
- shortcut = 1;
- else
+ // The threshold of 3 is empirically obtained.
+ if (UNLIKELY(abs(x) > 3)) {
shortcut = 0;
+ } else {
+#if CONFIG_NEW_QUANT
+ shortcut = ((av1_dequant_abscoeff_nuq(abs(x), dequant_ptr[rc != 0],
+ dequant_val[band_translate[i]]) >
+ (abs(coeff[rc]) << shift)) &&
+ (av1_dequant_abscoeff_nuq(abs(x) - 1, dequant_ptr[rc != 0],
+ dequant_val[band_translate[i]]) <
+ (abs(coeff[rc]) << shift)));
+#else // CONFIG_NEW_QUANT
+#if CONFIG_AOM_QM
+ if ((abs(x) * dequant_ptr[rc != 0] * iwt >
+ ((abs(coeff[rc]) << shift) << AOM_QM_BITS)) &&
+ (abs(x) * dequant_ptr[rc != 0] * iwt <
+ (((abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])
+ << AOM_QM_BITS)))
+#else
+ if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) &&
+ (abs(x) * dequant_ptr[rc != 0] <
+ (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0]))
+#endif // CONFIG_AOM_QM
+ shortcut = 1;
+ else
+ shortcut = 0;
+#endif // CONFIG_NEW_QUANT
+ }
if (shortcut) {
sz = -(x < 0);
x -= 2 * sz + 1;
+ } else {
+ tokens[i][1] = tokens[i][0];
+ best_index[i][1] = best_index[i][0];
+ next = i;
+
+ if (UNLIKELY(!(--band_left))) {
+ --band_counts;
+ band_left = *band_counts;
+ --token_costs;
+ }
+ continue;
}
/* Consider both possible successor states. */
@@ -225,48 +257,91 @@
*/
t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
- e0 = 0;
+ base_bits = 0;
} else {
- av1_get_token_extra(x, &t0, &e0);
+ base_bits = av1_get_token_cost(x, &t0, cat6_high_cost);
t1 = t0;
}
- if (next < default_eob) {
- band = band_translate[i + 1];
- if (t0 != EOB_TOKEN) {
- pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
- rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
- [tokens[next][0].token];
+
+ if (next_shortcut) {
+ if (LIKELY(next < default_eob)) {
+ if (t0 != EOB_TOKEN) {
+ token_cache[rc] = av1_pt_energy_class[t0];
+ pt = get_coef_context(nb, token_cache, i + 1);
+ rate0 += (*token_costs)[!x][pt][tokens[next][0].token];
+ }
+ if (t1 != EOB_TOKEN) {
+ token_cache[rc] = av1_pt_energy_class[t1];
+ pt = get_coef_context(nb, token_cache, i + 1);
+ rate1 += (*token_costs)[!x][pt][tokens[next][1].token];
+ }
}
- if (t1 != EOB_TOKEN) {
- pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
- rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt]
- [tokens[next][1].token];
+
+ UPDATE_RD_COST();
+ /* And pick the best. */
+ best = rd_cost1 < rd_cost0;
+ } else {
+ // The two states in next stage are identical.
+ if (next < default_eob && t0 != EOB_TOKEN) {
+ token_cache[rc] = av1_pt_energy_class[t0];
+ pt = get_coef_context(nb, token_cache, i + 1);
+ rate0 += (*token_costs)[!x][pt][tokens[next][0].token];
}
+ best = 0;
}
- UPDATE_RD_COST();
- /* And pick the best. */
- best = rd_cost1 < rd_cost0;
- base_bits = av1_get_cost(t0, e0, cat6_high_cost);
-
- if (shortcut) {
+#if CONFIG_NEW_QUANT
+ dx = av1_dequant_coeff_nuq(x, dequant_ptr[rc != 0],
+ dequant_val[band_translate[i]]) -
+ (coeff[rc] << shift);
#if CONFIG_AOM_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
- } else {
- dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
- }
-#else
- dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
-#endif // CONFIG_AOM_HIGHBITDEPTH
- d2 = dx * dx;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx >>= xd->bd - 8;
}
+#endif // CONFIG_AOM_HIGHBITDEPTH
+#else // CONFIG_NEW_QUANT
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
+ } else {
+ dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+ }
+#else
+ dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+#endif // CONFIG_AOM_HIGHBITDEPTH
+#endif // CONFIG_NEW_QUANT
+ d2 = (int64_t)dx * dx;
tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
tokens[i][1].error = d2 + (best ? error1 : error0);
tokens[i][1].next = next;
tokens[i][1].token = best ? t1 : t0;
tokens[i][1].qc = x;
+
+ if (x) {
+#if CONFIG_NEW_QUANT
+ tokens[i][1].dqc = av1_dequant_abscoeff_nuq(
+ abs(x), dequant_ptr[rc != 0], dequant_val[band_translate[i]]);
+ tokens[i][1].dqc = shift ? ROUND_POWER_OF_TWO(tokens[i][1].dqc, shift)
+ : tokens[i][1].dqc;
+ if (sz) tokens[i][1].dqc = -tokens[i][1].dqc;
+#else
+ tran_low_t offset = dq_step[rc != 0];
+ // The 32x32 transform coefficient uses half quantization step size.
+      // Account for the rounding difference in the dequantized coefficient
+ // value when the quantization index is dropped from an even number
+ // to an odd number.
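+      // For example, with shift == 1 and a dequant step of 21, lowering x
+      // from 4 to 3 moves the dequantized value from 42 to 31, a step of
+      // 11 = (21 >> 1) + (21 & 1); hence the extra term when x is odd.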
+ if (shift & x) offset += (dequant_ptr[rc != 0] & 0x01);
+
+ if (sz == 0)
+ tokens[i][1].dqc = dqcoeff[rc] - offset;
+ else
+ tokens[i][1].dqc = dqcoeff[rc] + offset;
+#endif // CONFIG_NEW_QUANT
+ } else {
+ tokens[i][1].dqc = 0;
+ }
+
best_index[i][1] = best;
/* Finally, make this the new head of the trellis. */
next = i;
@@ -274,58 +349,50 @@
/* There's no choice to make for a zero coefficient, so we don't
* add a new trellis node, but we do need to update the costs.
*/
- band = band_translate[i + 1];
t0 = tokens[next][0].token;
t1 = tokens[next][1].token;
+ pt = get_coef_context(nb, token_cache, i + 1);
/* Update the cost of each path if we're past the EOB token. */
if (t0 != EOB_TOKEN) {
- tokens[next][0].rate +=
- mb->token_costs[tx_size][type][ref][band][1][0][t0];
+ tokens[next][0].rate += (*token_costs)[1][pt][t0];
tokens[next][0].token = ZERO_TOKEN;
}
if (t1 != EOB_TOKEN) {
- tokens[next][1].rate +=
- mb->token_costs[tx_size][type][ref][band][1][0][t1];
+ tokens[next][1].rate += (*token_costs)[1][pt][t1];
tokens[next][1].token = ZERO_TOKEN;
}
best_index[i][0] = best_index[i][1] = 0;
+ shortcut = (tokens[next][0].rate != tokens[next][1].rate);
/* Don't update next, because we didn't add a new node. */
}
+
+ if (UNLIKELY(!(--band_left))) {
+ --band_counts;
+ band_left = *band_counts;
+ --token_costs;
+ }
}
/* Now pick the best path through the whole trellis. */
- band = band_translate[i + 1];
rate0 = tokens[next][0].rate;
rate1 = tokens[next][1].rate;
error0 = tokens[next][0].error;
error1 = tokens[next][1].error;
t0 = tokens[next][0].token;
t1 = tokens[next][1].token;
- rate0 += mb->token_costs[tx_size][type][ref][band][0][ctx][t0];
- rate1 += mb->token_costs[tx_size][type][ref][band][0][ctx][t1];
+ rate0 += (*token_costs)[0][ctx][t0];
+ rate1 += (*token_costs)[0][ctx][t1];
UPDATE_RD_COST();
best = rd_cost1 < rd_cost0;
+
final_eob = -1;
- memset(qcoeff, 0, sizeof(*qcoeff) * default_eob);
- memset(dqcoeff, 0, sizeof(*dqcoeff) * default_eob);
+
for (i = next; i < eob; i = next) {
const int x = tokens[i][best].qc;
const int rc = scan[i];
-#if CONFIG_AOM_QM
- const int iwt = iqmatrix[rc];
- const int dequant =
- (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-#endif
- if (x) {
- final_eob = i;
- }
-
+ if (x) final_eob = i;
qcoeff[rc] = x;
-#if CONFIG_AOM_QM
- dqcoeff[rc] = (x * dequant) / mul;
-#else
- dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul;
-#endif
+ dqcoeff[rc] = tokens[i][best].dqc;
next = tokens[i][best].next;
best = best_index[i][best];
@@ -333,248 +400,46 @@
final_eob++;
mb->plane[plane].eobs[block] = final_eob;
+ assert(final_eob <= default_eob);
return final_eob;
}
-#endif
-
-// TODO(sarahparker) refactor fwd quant functions to use fwd_txfm fns in
-// hybrid_fwd_txfm.c
-void av1_xform_quant_fp(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
- int block, int blk_row, int blk_col,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
- MACROBLOCKD *const xd = &x->e_mbd;
-#if !CONFIG_PVQ
- const struct macroblock_plane *const p = &x->plane[plane];
- const struct macroblockd_plane *const pd = &xd->plane[plane];
-#else
- struct macroblock_plane *const p = &x->plane[plane];
- struct macroblockd_plane *const pd = &xd->plane[plane];
-#endif
- PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
- TX_TYPE tx_type = get_tx_type(plane_type, xd, block);
- const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type);
- tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
- tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
- tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- uint16_t *const eob = &p->eobs[block];
- const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
- int seg_id = xd->mi[0]->mbmi.segment_id;
-#if CONFIG_AOM_QM
- int is_intra = !is_inter_block(&xd->mi[0]->mbmi);
- const qm_val_t *qmatrix = pd->seg_qmatrix[seg_id][is_intra][tx_size];
- const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][is_intra][tx_size];
-#endif
-#if !CONFIG_PVQ
- const int16_t *src_diff;
- (void)cm;
-
- /*
- FWD_TXFM_PARAM fwd_txfm_param;
- fwd_txfm_param.tx_type = tx_type;
- fwd_txfm_param.tx_size = tx_size;
- fwd_txfm_param.fwd_txfm_opt = FWD_TXFM_OPT_NORMAL;
- fwd_txfm_param.rd_transform = x->use_lp32x32fdct;
- fwd_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
- */
-
- src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
-#else
- MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
- tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
- uint8_t *src, *dst;
- int16_t *src_int16, *pred;
- const int src_stride = p->src.stride;
- const int dst_stride = pd->dst.stride;
- int tx_blk_size;
- int i, j;
- int skip = 1;
- PVQ_INFO *pvq_info = NULL;
-
- (void)scan_order;
- (void)qcoeff;
-
- if (x->pvq_coded) {
- assert(block < MAX_PVQ_BLOCKS_IN_SB);
- pvq_info = &x->pvq[block][plane];
- }
- dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
- src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
- src_int16 = &p->src_int16[4 * (blk_row * diff_stride + blk_col)];
- pred = &pd->pred[4 * (blk_row * diff_stride + blk_col)];
- // transform block size in pixels
- tx_blk_size = tx_size_1d[tx_size];
-
- // copy uint8 orig and predicted block to int16 buffer
- // in order to use existing VP10 transform functions
- for (j = 0; j < tx_blk_size; j++)
- for (i = 0; i < tx_blk_size; i++) {
- src_int16[diff_stride * j + i] = src[src_stride * j + i];
- pred[diff_stride * j + i] = dst[dst_stride * j + i];
- }
-#endif
#if CONFIG_AOM_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- switch (tx_size) {
- case TX_32X32:
- highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
- av1_highbd_quantize_fp_32x32(
- coeff, 1024, x->skip_block, p->zbin, p->round_fp, p->quant_fp,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- break;
- case TX_16X16:
- aom_highbd_fdct16x16(src_diff, coeff, diff_stride);
- av1_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- break;
- case TX_8X8:
- aom_highbd_fdct8x8(src_diff, coeff, diff_stride);
- av1_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- break;
- case TX_4X4:
- if (xd->lossless[seg_id]) {
- av1_highbd_fwht4x4(src_diff, coeff, diff_stride);
- } else {
- aom_highbd_fdct4x4(src_diff, coeff, diff_stride);
- }
- av1_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- break;
- default: assert(0);
- }
- return;
- }
-#endif // CONFIG_AOM_HIGHBITDEPTH
+typedef enum QUANT_FUNC {
+ QUANT_FUNC_LOWBD = 0,
+ QUANT_FUNC_HIGHBD = 1,
+ QUANT_FUNC_LAST = 2
+} QUANT_FUNC;
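+// quant_func_list below is indexed as [AV1_XFORM_QUANT method][bit depth];
+// the AV1_XFORM_QUANT_SKIP_QUANT row stays NULL because that path bypasses
+// quantization entirely.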
-#if !CONFIG_PVQ
- switch (tx_size) {
- case TX_32X32:
- fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
- av1_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- break;
- case TX_16X16:
- aom_fdct16x16(src_diff, coeff, diff_stride);
- av1_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff, dqcoeff, pd->dequant,
- eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- break;
- case TX_8X8:
- av1_fdct8x8_quant(src_diff, diff_stride, coeff, 64, x->skip_block,
- p->zbin, p->round_fp, p->quant_fp, p->quant_shift,
- qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- break;
- case TX_4X4:
- if (xd->lossless[seg_id]) {
- av1_fwht4x4(src_diff, coeff, diff_stride);
- } else {
- aom_fdct4x4(src_diff, coeff, diff_stride);
- }
- av1_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff, dqcoeff, pd->dequant,
- eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- break;
- default: assert(0); break;
- }
-#else // #if !CONFIG_PVQ
- switch (tx_size) {
- case TX_32X32:
- // NOTE: Using x->use_lp32x32fdct == 1 will makes enc and dec mismatched,
- // because decoder always uses x->use_lp32x32fdct == 0,
- // forward transform of predicted image.
- fdct32x32(0, pred, ref_coeff, diff_stride);
- // forward transform of original image.
- fdct32x32(0, src_int16, coeff, diff_stride);
- break;
- case TX_16X16:
- aom_fdct16x16(pred, ref_coeff, diff_stride);
- aom_fdct16x16(src_int16, coeff, diff_stride);
- break;
- case TX_8X8:
- aom_fdct8x8(pred, ref_coeff, diff_stride);
- aom_fdct8x8(src_int16, coeff, diff_stride);
- break;
- case TX_4X4:
- if (xd->lossless[seg_id]) {
- av1_fwht4x4(pred, ref_coeff, diff_stride);
- av1_fwht4x4(src_int16, coeff, diff_stride);
- } else {
- aom_fdct4x4(pred, ref_coeff, diff_stride);
- aom_fdct4x4(src_int16, coeff, diff_stride);
- }
- break;
- default: assert(0); break;
- }
+static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_LAST][QUANT_FUNC_LAST] =
+ { { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade },
+ { av1_quantize_b_facade, av1_highbd_quantize_b_facade },
+ { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade },
+ { NULL, NULL } };
- // PVQ for inter mode block
- if (!x->skip_block)
- skip = av1_pvq_encode_helper(&x->daala_enc,
- coeff, // target original vector
- ref_coeff, // reference vector
- dqcoeff, // de-quantized vector
- eob, // End of Block marker
- pd->dequant, // aom's quantizers
- plane, // image plane
- tx_size, // block size in log_2 - 2
- tx_type,
- &x->rate, // rate measured
- x->pvq_speed,
- pvq_info); // PVQ info for a block
+#elif !CONFIG_PVQ
- x->pvq_skip[plane] = skip;
+typedef enum QUANT_FUNC {
+ QUANT_FUNC_LOWBD = 0,
+ QUANT_FUNC_LAST = 1
+} QUANT_FUNC;
- if (!skip) mbmi->skip = 0;
-#endif // #if !CONFIG_PVQ
-}
+static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_LAST]
+ [QUANT_FUNC_LAST] = {
+ { av1_quantize_fp_facade },
+ { av1_quantize_b_facade },
+ { av1_quantize_dc_facade },
+ { NULL }
+ };
+#endif
-void av1_xform_quant(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
- int block, int blk_row, int blk_col,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
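+// Forward-transform optimization used with each AV1_XFORM_QUANT method;
+// only the DC-only quantizer uses the DC-optimized transform.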
+static FWD_TXFM_OPT fwd_txfm_opt_list[AV1_XFORM_QUANT_LAST] = {
+ FWD_TXFM_OPT_NORMAL, FWD_TXFM_OPT_NORMAL, FWD_TXFM_OPT_DC, FWD_TXFM_OPT_NORMAL
+};
+
+void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
+ int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, AV1_XFORM_QUANT xform_quant_idx) {
MACROBLOCKD *const xd = &x->e_mbd;
#if !CONFIG_PVQ
const struct macroblock_plane *const p = &x->plane[plane];
@@ -584,26 +449,29 @@
struct macroblockd_plane *const pd = &xd->plane[plane];
#endif
PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
- TX_TYPE tx_type = get_tx_type(plane_type, xd, block);
- const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type);
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const int is_inter = is_inter_block(&xd->mi[0]->mbmi);
+ const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, is_inter);
tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
uint16_t *const eob = &p->eobs[block];
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
- int seg_id = xd->mi[0]->mbmi.segment_id;
- FWD_TXFM_PARAM fwd_txfm_param;
-
#if CONFIG_AOM_QM
- int is_intra = !is_inter_block(&xd->mi[0]->mbmi);
- const qm_val_t *qmatrix = pd->seg_qmatrix[seg_id][is_intra][tx_size];
- const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][is_intra][tx_size];
+ int seg_id = xd->mi[0]->mbmi.segment_id;
+ const qm_val_t *qmatrix = pd->seg_qmatrix[seg_id][!is_inter][tx_size];
+ const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][!is_inter][tx_size];
#endif
+ FWD_TXFM_PARAM fwd_txfm_param;
+
#if !CONFIG_PVQ
+ const int tx2d_size = tx_size_2d[tx_size];
+ QUANT_PARAM qparam;
const int16_t *src_diff;
src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+ qparam.log_scale = get_tx_scale(xd, tx_type, tx_size);
#else
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
@@ -629,7 +497,7 @@
pred = &pd->pred[4 * (blk_row * diff_stride + blk_col)];
// transform block size in pixels
- tx_blk_size = tx_size_1d[tx_size];
+ tx_blk_size = tx_size_wide[tx_size];
// copy uint8 orig and predicted block to int16 buffer
// in order to use existing VP10 transform functions
@@ -642,55 +510,26 @@
fwd_txfm_param.tx_type = tx_type;
fwd_txfm_param.tx_size = tx_size;
- fwd_txfm_param.fwd_txfm_opt = FWD_TXFM_OPT_NORMAL;
+ fwd_txfm_param.fwd_txfm_opt = fwd_txfm_opt_list[xform_quant_idx];
fwd_txfm_param.rd_transform = x->use_lp32x32fdct;
- fwd_txfm_param.lossless = xd->lossless[seg_id];
+ fwd_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
#if CONFIG_AOM_HIGHBITDEPTH
+ fwd_txfm_param.bd = xd->bd;
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
- switch (tx_size) {
- case TX_32X32:
- aom_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
- p->round, p->quant, p->quant_shift, qcoeff,
- dqcoeff, pd->dequant, eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- break;
- case TX_16X16:
- aom_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
- p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- break;
- case TX_8X8:
- aom_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
- p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- break;
- case TX_4X4:
- aom_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
- p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- break;
- default: assert(0);
+ if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
+ if (LIKELY(!x->skip_block)) {
+ quant_func_list[xform_quant_idx][QUANT_FUNC_HIGHBD](
+ coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam
+#if CONFIG_AOM_QM
+ ,
+ qmatrix, iqmatrix
+#endif // CONFIG_AOM_QM
+ );
+ } else {
+ av1_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
+ }
}
return;
}
@@ -698,48 +537,18 @@
#if !CONFIG_PVQ
fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
- switch (tx_size) {
- case TX_32X32:
- aom_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
- p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- break;
- case TX_16X16:
- aom_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
- scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- break;
- case TX_8X8:
- aom_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
- scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- break;
- case TX_4X4:
- aom_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
- scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- break;
- default: assert(0); break;
+ if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
+ if (LIKELY(!x->skip_block)) {
+ quant_func_list[xform_quant_idx][QUANT_FUNC_LOWBD](
+ coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob, scan_order, &qparam
+#if CONFIG_AOM_QM
+ ,
+ qmatrix, iqmatrix
+#endif // CONFIG_AOM_QM
+ );
+ } else {
+ av1_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
+ }
}
#else // #if !CONFIG_PVQ
fwd_txfm_param.rd_transform = 0;
@@ -768,47 +577,343 @@
#endif // #if !CONFIG_PVQ
}
+#if CONFIG_NEW_QUANT
+void av1_xform_quant_nuq(const AV1_COMMON *cm, MACROBLOCK *x, int plane,
+ int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int ctx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const int is_inter = is_inter_block(&xd->mi[0]->mbmi);
+ const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, is_inter);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ int dq = get_dq_profile_from_ctx(x->qindex, ctx, is_inter, plane_type);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int16_t *src_diff;
+ const uint8_t *band = get_band_translate(tx_size);
+
+ FWD_TXFM_PARAM fwd_txfm_param;
+
+ assert((x->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
+
+ fwd_txfm_param.tx_type = tx_type;
+ fwd_txfm_param.tx_size = tx_size;
+ fwd_txfm_param.fwd_txfm_opt = fwd_txfm_opt_list[AV1_XFORM_QUANT_FP];
+ fwd_txfm_param.rd_transform = x->use_lp32x32fdct;
+ fwd_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+
+ src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+// TODO(sarahparker) add all of these new quant quantize functions
+// to quant_func_list, just trying to get this expr to work for now
+#if CONFIG_AOM_HIGHBITDEPTH
+ fwd_txfm_param.bd = xd->bd;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+ if (tx_size == TX_32X32) {
+ highbd_quantize_32x32_nuq(
+ coeff, tx_size_2d[tx_size], x->skip_block, p->quant, p->quant_shift,
+ pd->dequant, (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
+ dqcoeff, eob, scan_order->scan, band);
+ } else {
+ highbd_quantize_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant,
+ p->quant_shift, pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
+ qcoeff, dqcoeff, eob, scan_order->scan, band);
+ }
+ return;
+ }
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+ fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+ if (tx_size == TX_32X32) {
+ quantize_32x32_nuq(coeff, 1024, x->skip_block, p->quant, p->quant_shift,
+ pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
+ qcoeff, dqcoeff, eob, scan_order->scan, band);
+ } else {
+ quantize_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant,
+ p->quant_shift, pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
+ dqcoeff, eob, scan_order->scan, band);
+ }
+}
+
+void av1_xform_quant_fp_nuq(const AV1_COMMON *cm, MACROBLOCK *x, int plane,
+ int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int ctx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int is_inter = is_inter_block(&xd->mi[0]->mbmi);
+ PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, is_inter);
+ int dq = get_dq_profile_from_ctx(x->qindex, ctx, is_inter, plane_type);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int16_t *src_diff;
+ const uint8_t *band = get_band_translate(tx_size);
+
+ FWD_TXFM_PARAM fwd_txfm_param;
+
+ assert((x->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
+
+ fwd_txfm_param.tx_type = tx_type;
+ fwd_txfm_param.tx_size = tx_size;
+ fwd_txfm_param.fwd_txfm_opt = fwd_txfm_opt_list[AV1_XFORM_QUANT_FP];
+ fwd_txfm_param.rd_transform = x->use_lp32x32fdct;
+ fwd_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+
+ src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+// TODO(sarahparker) add all of these new quant quantize functions
+// to quant_func_list, just trying to get this expr to work for now
+#if CONFIG_AOM_HIGHBITDEPTH
+ fwd_txfm_param.bd = xd->bd;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+ if (tx_size == TX_32X32) {
+ highbd_quantize_32x32_fp_nuq(
+ coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp, pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
+ dqcoeff, eob, scan_order->scan, band);
+ } else {
+ highbd_quantize_fp_nuq(
+ coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp, pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq], qcoeff,
+ dqcoeff, eob, scan_order->scan, band);
+ }
+ return;
+ }
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+ fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+ if (tx_size == TX_32X32) {
+ quantize_32x32_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
+ p->quant_fp, pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
+ qcoeff, dqcoeff, eob, scan_order->scan, band);
+ } else {
+ quantize_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp,
+ pd->dequant,
+ (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+ (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
+ qcoeff, dqcoeff, eob, scan_order->scan, band);
+ }
+}
+
+void av1_xform_quant_dc_nuq(MACROBLOCK *x, int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, int ctx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int16_t *src_diff;
+ const int is_inter = is_inter_block(&xd->mi[0]->mbmi);
+ int dq = get_dq_profile_from_ctx(x->qindex, ctx, is_inter, plane_type);
+
+ FWD_TXFM_PARAM fwd_txfm_param;
+
+ assert((x->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
+
+ fwd_txfm_param.tx_type = tx_type;
+ fwd_txfm_param.tx_size = tx_size;
+ fwd_txfm_param.fwd_txfm_opt = fwd_txfm_opt_list[AV1_XFORM_QUANT_DC];
+ fwd_txfm_param.rd_transform = x->use_lp32x32fdct;
+ fwd_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+
+ src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+// TODO(sarahparker) add all of these new quant quantize functions
+// to quant_func_list, just trying to get this expr to work for now
+#if CONFIG_AOM_HIGHBITDEPTH
+ fwd_txfm_param.bd = xd->bd;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+ if (tx_size == TX_32X32) {
+ highbd_quantize_dc_32x32_nuq(
+ coeff, tx_size_2d[tx_size], x->skip_block, p->quant[0],
+ p->quant_shift[0], pd->dequant[0], p->cuml_bins_nuq[dq][0],
+ pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
+ } else {
+ highbd_quantize_dc_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
+ p->quant[0], p->quant_shift[0], pd->dequant[0],
+ p->cuml_bins_nuq[dq][0],
+ pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
+ }
+ return;
+ }
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+ fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+ if (tx_size == TX_32X32) {
+ quantize_dc_32x32_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
+ p->quant[0], p->quant_shift[0], pd->dequant[0],
+ p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
+ qcoeff, dqcoeff, eob);
+ } else {
+ quantize_dc_nuq(coeff, tx_size_2d[tx_size], x->skip_block, p->quant[0],
+ p->quant_shift[0], pd->dequant[0], p->cuml_bins_nuq[dq][0],
+ pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
+ }
+}
+
+void av1_xform_quant_dc_fp_nuq(MACROBLOCK *x, int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, int ctx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int16_t *src_diff;
+ const int is_inter = is_inter_block(&xd->mi[0]->mbmi);
+ int dq = get_dq_profile_from_ctx(x->qindex, ctx, is_inter, plane_type);
+
+ FWD_TXFM_PARAM fwd_txfm_param;
+
+ assert((x->qindex == 0) ^ (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0));
+
+ fwd_txfm_param.tx_type = tx_type;
+ fwd_txfm_param.tx_size = tx_size;
+ fwd_txfm_param.fwd_txfm_opt = fwd_txfm_opt_list[AV1_XFORM_QUANT_DC];
+ fwd_txfm_param.rd_transform = x->use_lp32x32fdct;
+ fwd_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+
+ src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+// TODO(sarahparker) add all of these new quant quantize functions
+// to quant_func_list, just trying to get this expr to work for now
+#if CONFIG_AOM_HIGHBITDEPTH
+ fwd_txfm_param.bd = xd->bd;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+ if (tx_size == TX_32X32) {
+ highbd_quantize_dc_32x32_fp_nuq(
+ coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp[0],
+ pd->dequant[0], p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
+ qcoeff, dqcoeff, eob);
+ } else {
+ highbd_quantize_dc_fp_nuq(
+ coeff, tx_size_2d[tx_size], x->skip_block, p->quant_fp[0],
+ pd->dequant[0], p->cuml_bins_nuq[dq][0], pd->dequant_val_nuq[dq][0],
+ qcoeff, dqcoeff, eob);
+ }
+ return;
+ }
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+ fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+ if (tx_size == TX_32X32) {
+ quantize_dc_32x32_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
+ p->quant_fp[0], pd->dequant[0],
+ p->cuml_bins_nuq[dq][0],
+ pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
+ } else {
+ quantize_dc_fp_nuq(coeff, tx_size_2d[tx_size], x->skip_block,
+ p->quant_fp[0], pd->dequant[0], p->cuml_bins_nuq[dq][0],
+ pd->dequant_val_nuq[dq][0], qcoeff, dqcoeff, eob);
+ }
+}
+#endif // CONFIG_NEW_QUANT
+
static void encode_block(int plane, int block, int blk_row, int blk_col,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
struct encode_b_args *const args = arg;
- const AV1_COMMON *const cm = args->cm;
+ AV1_COMMON *cm = args->cm;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
- struct optimize_ctx *const ctx = args->ctx;
+ int ctx;
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
uint8_t *dst;
ENTROPY_CONTEXT *a, *l;
- TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block);
+ INV_TXFM_PARAM inv_txfm_param;
#if CONFIG_PVQ
int tx_blk_size;
int i, j;
#endif
+#if CONFIG_VAR_TX
+ int i;
+ const int bwl = b_width_log2_lookup[plane_bsize];
+#endif
dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
- a = &ctx->ta[plane][blk_col];
- l = &ctx->tl[plane][blk_row];
+ a = &args->ta[blk_col];
+ l = &args->tl[blk_row];
+#if CONFIG_VAR_TX
+ ctx = get_entropy_context(tx_size, a, l);
+#else
+ ctx = combine_entropy_contexts(*a, *l);
+#endif
- if (x->quant_fp) {
- av1_xform_quant_fp(cm, x, plane, block, blk_row, blk_col, plane_bsize,
- tx_size);
- } else {
- av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
- tx_size);
+#if CONFIG_VAR_TX
+ // Assert not magic number (uninitialized).
+ assert(x->blk_skip[plane][(blk_row << bwl) + blk_col] != 234);
+
+ if (x->blk_skip[plane][(blk_row << bwl) + blk_col] == 0) {
+#else
+ {
+#endif
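+  // With CONFIG_VAR_TX off there is no per-block skip decision; the bare
+  // braces keep the scope structure identical in both configurations.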
+#if CONFIG_NEW_QUANT
+ av1_xform_quant_fp_nuq(cm, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, ctx);
+#else
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ AV1_XFORM_QUANT_FP);
+#endif // CONFIG_NEW_QUANT
}
-
+#if CONFIG_VAR_TX
+ else {
+ p->eobs[block] = 0;
+ }
+#endif
#if !CONFIG_PVQ
- if (x->optimize) {
- const int combined_ctx = combine_entropy_contexts(*a, *l);
- *a = *l = optimize_b(cm, x, plane, block, tx_size, combined_ctx) > 0;
+ if (p->eobs[block]) {
+ *a = *l = av1_optimize_b(cm, x, plane, block, tx_size, ctx) > 0;
} else {
*a = *l = p->eobs[block] > 0;
}
+#if CONFIG_VAR_TX
+ for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) a[i] = a[0];
+
+ for (i = 0; i < tx_size_high_unit[tx_size]; ++i) l[i] = l[0];
+#endif
+
if (p->eobs[block]) *(args->skip) = 0;
if (p->eobs[block] == 0) return;
#else
+ (void)ctx;
*a = *l = !x->pvq_skip[plane];
if (!x->pvq_skip[plane]) *(args->skip) = 0;
@@ -816,7 +921,7 @@
if (x->pvq_skip[plane]) return;
// transform block size in pixels
- tx_blk_size = tx_size_1d[tx_size];
+ tx_blk_size = tx_size_wide[tx_size];
// Since av1 does not have separate function which does inverse transform
// but av1_inv_txfm_add_*x*() also does addition of predicted image to
@@ -826,58 +931,70 @@
for (i = 0; i < tx_blk_size; i++) dst[j * pd->dst.stride + i] = 0;
#endif
+ // inverse transform parameters
+ inv_txfm_param.tx_type = get_tx_type(pd->plane_type, xd, block, tx_size);
+ inv_txfm_param.tx_size = tx_size;
+ inv_txfm_param.eob = p->eobs[block];
+ inv_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+
#if CONFIG_AOM_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- switch (tx_size) {
- case TX_32X32:
- av1_highbd_inv_txfm_add_32x32(dqcoeff, dst, pd->dst.stride,
- p->eobs[block], xd->bd, tx_type);
- break;
- case TX_16X16:
- av1_highbd_inv_txfm_add_16x16(dqcoeff, dst, pd->dst.stride,
- p->eobs[block], xd->bd, tx_type);
- break;
- case TX_8X8:
- av1_highbd_inv_txfm_add_8x8(dqcoeff, dst, pd->dst.stride,
- p->eobs[block], xd->bd, tx_type);
- break;
- case TX_4X4:
- // this is like av1_short_idct4x4 but has a special case around eob<=1
- // which is significant (not just an optimization) for the lossless
- // case.
- av1_highbd_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride,
- p->eobs[block], xd->bd, tx_type,
- xd->lossless[xd->mi[0]->mbmi.segment_id]);
- break;
- default: assert(0 && "Invalid transform size"); break;
- }
-
+ inv_txfm_param.bd = xd->bd;
+ highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &inv_txfm_param);
return;
}
#endif // CONFIG_AOM_HIGHBITDEPTH
- switch (tx_size) {
- case TX_32X32:
- av1_inv_txfm_add_32x32(dqcoeff, dst, pd->dst.stride, p->eobs[block],
- tx_type);
- break;
- case TX_16X16:
- av1_inv_txfm_add_16x16(dqcoeff, dst, pd->dst.stride, p->eobs[block],
- tx_type);
- break;
- case TX_8X8:
- av1_inv_txfm_add_8x8(dqcoeff, dst, pd->dst.stride, p->eobs[block],
- tx_type);
- break;
- case TX_4X4:
- // this is like av1_short_idct4x4 but has a special case around eob<=1
- // which is significant (not just an optimization) for the lossless
- // case.
- av1_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride, p->eobs[block],
- tx_type, xd->lossless[xd->mi[0]->mbmi.segment_id]);
- break;
- default: assert(0 && "Invalid transform size"); break;
+ inv_txfm_add(dqcoeff, dst, pd->dst.stride, &inv_txfm_param);
+}
+
+#if CONFIG_VAR_TX
+static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ struct encode_b_args *const args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int tx_row = blk_row >> (1 - pd->subsampling_y);
+ const int tx_col = blk_col >> (1 - pd->subsampling_x);
+ TX_SIZE plane_tx_size;
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ plane_tx_size =
+ plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
+ : mbmi->inter_tx_size[tx_row][tx_col];
+
+ if (tx_size == plane_tx_size) {
+ encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ // This is the square transform block partition entry point.
+ int bsl = tx_size_wide_unit[sub_txs];
+ int i;
+ assert(bsl > 0);
+#if CONFIG_EXT_TX
+ assert(tx_size < TX_SIZES);
+#endif // CONFIG_EXT_TX
+
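+    // Recurse into the four sub_txs-sized quadrants; `block` advances by
+    // the number of 4x4 units each quadrant covers.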
+ for (i = 0; i < 4; ++i) {
+ const int offsetr = blk_row + ((i >> 1) * bsl);
+ const int offsetc = blk_col + ((i & 0x01) * bsl);
+ int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ encode_block_inter(plane, block, offsetr, offsetc, plane_bsize, sub_txs,
+ arg);
+ block += step;
+ }
}
}
+#endif
typedef struct encode_block_pass1_args {
AV1_COMMON *cm;
@@ -895,22 +1012,30 @@
struct macroblockd_plane *const pd = &xd->plane[plane];
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
uint8_t *dst;
+#if CONFIG_NEW_QUANT
+ int ctx;
+#endif // CONFIG_NEW_QUANT
dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
- av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
-
+#if CONFIG_NEW_QUANT
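+  // Pass 1 does not track per-block entropy contexts, so a fixed ctx of 0
+  // is used here.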
+ ctx = 0;
+ av1_xform_quant_fp_nuq(cm, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, ctx);
+#else
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ AV1_XFORM_QUANT_B);
+#endif // CONFIG_NEW_QUANT
#if !CONFIG_PVQ
if (p->eobs[block] > 0) {
#else
if (!x->pvq_skip[plane]) {
#endif
-
#if CONFIG_PVQ
{
int tx_blk_size;
int i, j;
// transform block size in pixels
- tx_blk_size = tx_size_1d[tx_size];
+ tx_blk_size = tx_size_wide[tx_size];
// Since av1 does not have separate function which does inverse transform
// but av1_inv_txfm_add_*x*() also does addition of predicted image to
@@ -920,10 +1045,9 @@
for (i = 0; i < tx_blk_size; i++) dst[j * pd->dst.stride + i] = 0;
}
#endif
-
#if CONFIG_AOM_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- if (xd->lossless[0]) {
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
av1_highbd_iwht4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block],
xd->bd);
} else {
@@ -933,7 +1057,7 @@
return;
}
#endif // CONFIG_AOM_HIGHBITDEPTH
- if (xd->lossless[0]) {
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
av1_iwht4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
} else {
av1_idct4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
@@ -952,7 +1076,7 @@
MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx ctx;
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
- struct encode_b_args arg = { cm, x, &ctx, &mbmi->skip };
+ struct encode_b_args arg = { cm, x, &ctx, &mbmi->skip, NULL, NULL, 1 };
int plane;
mbmi->skip = 1;
@@ -960,56 +1084,114 @@
if (x->skip) return;
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_VAR_TX
+ // TODO(jingning): Clean this up.
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0];
+ const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+ const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+ const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
+ const int bh = block_size_high[txb_size] >> tx_size_wide_log2[0];
+ int idx, idy;
+ int block = 0;
+ int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ av1_get_entropy_contexts(bsize, TX_4X4, pd, ctx.ta[plane], ctx.tl[plane]);
+#else
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
+ av1_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]);
+#endif
#if !CONFIG_PVQ
av1_subtract_plane(x, bsize, plane);
#endif
- if (x->optimize) {
- const struct macroblockd_plane *const pd = &xd->plane[plane];
- const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
- av1_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane],
- ctx.tl[plane]);
- }
+ arg.ta = ctx.ta[plane];
+ arg.tl = ctx.tl[plane];
+#if CONFIG_VAR_TX
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (is_rect_tx(mbmi->tx_size)) {
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
+ &arg);
+ } else {
+#endif
+ for (idy = 0; idy < mi_height; idy += bh) {
+ for (idx = 0; idx < mi_width; idx += bw) {
+ encode_block_inter(plane, block, idy, idx, plane_bsize, max_tx_size,
+ &arg);
+ block += step;
+ }
+ }
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ }
+#endif
+#else
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
+ &arg);
+#endif
+ }
+}
+
+#if CONFIG_SUPERTX
+void av1_encode_sb_supertx(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct optimize_ctx ctx;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ struct encode_b_args arg = { cm, x, &ctx, &mbmi->skip, NULL, NULL, 1 };
+ int plane;
+
+ mbmi->skip = 1;
+ if (x->skip) return;
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+#if CONFIG_VAR_TX
+ const TX_SIZE tx_size = TX_4X4;
+#else
+ const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
+#endif
+ av1_subtract_plane(x, bsize, plane);
+ av1_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]);
+ arg.ta = ctx.ta[plane];
+ arg.tl = ctx.tl[plane];
av1_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
&arg);
}
}
+#endif // CONFIG_SUPERTX
void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
void *arg) {
struct encode_b_args *const args = arg;
- MACROBLOCK *const x = args->x;
+#if !CONFIG_PVQ
AV1_COMMON *cm = args->cm;
+#endif
+ MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
- tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
- tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
- TX_TYPE tx_type = get_tx_type(plane_type, xd, block);
- const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type);
+ const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
PREDICTION_MODE mode;
- const int bwl = b_width_log2_lookup[plane_bsize];
- const int bhl = b_height_log2_lookup[plane_bsize];
- const int diff_stride = 4 * (1 << bwl);
+ const int diff_stride = block_size_wide[plane_bsize];
uint8_t *src, *dst;
+ int16_t *src_diff;
uint16_t *eob = &p->eobs[block];
- int seg_id = xd->mi[0]->mbmi.segment_id;
-#if CONFIG_AOM_QM
- int is_intra = !is_inter_block(&xd->mi[0]->mbmi);
- const qm_val_t *qmatrix = pd->seg_qmatrix[seg_id][is_intra][tx_size];
- const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][is_intra][tx_size];
-#endif
const int src_stride = p->src.stride;
const int dst_stride = pd->dst.stride;
+ const int tx1d_width = tx_size_wide[tx_size];
+ const int tx1d_height = tx_size_high[tx_size];
+#if !CONFIG_PVQ
+ ENTROPY_CONTEXT *a = NULL, *l = NULL;
+ int ctx;
+ INV_TXFM_PARAM inv_txfm_param;
+#else
FWD_TXFM_PARAM fwd_txfm_param;
- int16_t *src_diff;
- int tx1d_size = tx_size_1d[tx_size];
-
-#if CONFIG_PVQ
+ tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
int16_t *src_int16;
int tx_blk_size;
@@ -1017,9 +1199,7 @@
int16_t *pred = &pd->pred[4 * (blk_row * diff_stride + blk_col)];
int skip = 1;
PVQ_INFO *pvq_info = NULL;
-
- (void)scan_order;
- (void)qcoeff;
+ int seg_id = xd->mi[0]->mbmi.segment_id;
if (x->pvq_coded) {
assert(block < MAX_PVQ_BLOCKS_IN_SB);
@@ -1028,151 +1208,73 @@
src_int16 = &p->src_int16[4 * (blk_row * diff_stride + blk_col)];
#endif
- src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
-
- fwd_txfm_param.tx_type = tx_type;
- fwd_txfm_param.tx_size = tx_size;
- fwd_txfm_param.fwd_txfm_opt = FWD_TXFM_OPT_NORMAL;
- fwd_txfm_param.rd_transform = x->use_lp32x32fdct;
- fwd_txfm_param.lossless = xd->lossless[seg_id];
+ assert(tx1d_width == tx1d_height);
dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
+ src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
- av1_predict_intra_block(xd, bwl, bhl, tx_size, mode, dst, dst_stride, dst,
- dst_stride, blk_col, blk_row, plane);
-
+ av1_predict_intra_block(xd, pd->width, pd->height, tx_size, mode, dst,
+ dst_stride, dst, dst_stride, blk_col, blk_row, plane);
#if CONFIG_AOM_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- aom_highbd_subtract_block(tx1d_size, tx1d_size, src_diff, diff_stride, src,
- src_stride, dst, dst_stride, xd->bd);
- highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
-
- switch (tx_size) {
- case TX_32X32:
- aom_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
- p->round, p->quant, p->quant_shift, qcoeff,
- dqcoeff, pd->dequant, eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- if (*eob)
- av1_highbd_inv_txfm_add_32x32(dqcoeff, dst, dst_stride, *eob, xd->bd,
- tx_type);
- break;
- case TX_16X16:
- aom_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
- p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- if (*eob)
- av1_highbd_inv_txfm_add_16x16(dqcoeff, dst, dst_stride, *eob, xd->bd,
- tx_type);
- break;
- case TX_8X8:
- aom_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
- p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- if (*eob)
- av1_highbd_inv_txfm_add_8x8(dqcoeff, dst, dst_stride, *eob, xd->bd,
- tx_type);
- break;
- case TX_4X4:
- aom_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
- p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- if (*eob)
- // this is like av1_short_idct4x4 but has a special case around
- // eob<=1 which is significant (not just an optimization) for the
- // lossless case.
- av1_highbd_inv_txfm_add_4x4(dqcoeff, dst, dst_stride, *eob, xd->bd,
- tx_type, xd->lossless[seg_id]);
- break;
- default: assert(0); return;
- }
- if (*eob) *(args->skip) = 0;
- return;
+ aom_highbd_subtract_block(tx1d_height, tx1d_width, src_diff, diff_stride,
+ src, src_stride, dst, dst_stride, xd->bd);
+ } else {
+ aom_subtract_block(tx1d_height, tx1d_width, src_diff, diff_stride, src,
+ src_stride, dst, dst_stride);
}
+#else
+ aom_subtract_block(tx1d_height, tx1d_width, src_diff, diff_stride, src,
+ src_stride, dst, dst_stride);
#endif // CONFIG_AOM_HIGHBITDEPTH
- aom_subtract_block(tx1d_size, tx1d_size, src_diff, diff_stride, src,
- src_stride, dst, dst_stride);
-
#if !CONFIG_PVQ
- fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
- switch (tx_size) {
- case TX_32X32:
- aom_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
- p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, eob, scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- if (*eob) av1_inv_txfm_add_32x32(dqcoeff, dst, dst_stride, *eob, tx_type);
- break;
- case TX_16X16:
- aom_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
- scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- if (*eob) av1_inv_txfm_add_16x16(dqcoeff, dst, dst_stride, *eob, tx_type);
- break;
- case TX_8X8:
- aom_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
- scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
- if (*eob) av1_inv_txfm_add_8x8(dqcoeff, dst, dst_stride, *eob, tx_type);
- break;
- case TX_4X4:
- aom_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
- scan_order->scan,
-#if !CONFIG_AOM_QM
- scan_order->iscan);
-#else
- scan_order->iscan, qmatrix, iqmatrix);
-#endif
+ a = &args->ta[blk_col];
+ l = &args->tl[blk_row];
+ ctx = combine_entropy_contexts(*a, *l);
- if (*eob) {
- // this is like av1_short_idct4x4 but has a special case around eob<=1
- // which is significant (not just an optimization) for the lossless
- // case.
- av1_inv_txfm_add_4x4(dqcoeff, dst, dst_stride, *eob, tx_type,
- xd->lossless[seg_id]);
- }
- break;
- default: assert(0); break;
+ if (args->enable_optimize_b) {
+#if CONFIG_NEW_QUANT
+ av1_xform_quant_fp_nuq(cm, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, ctx);
+#else // CONFIG_NEW_QUANT
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ AV1_XFORM_QUANT_FP);
+#endif // CONFIG_NEW_QUANT
+ if (p->eobs[block]) {
+ *a = *l = av1_optimize_b(cm, x, plane, block, tx_size, ctx) > 0;
+ } else {
+ *a = *l = 0;
+ }
+ } else {
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ AV1_XFORM_QUANT_B);
+ *a = *l = p->eobs[block] > 0;
+ }
+
+ if (*eob) {
+ // inverse transform
+ inv_txfm_param.tx_type = tx_type;
+ inv_txfm_param.tx_size = tx_size;
+ inv_txfm_param.eob = *eob;
+ inv_txfm_param.lossless = xd->lossless[mbmi->segment_id];
+#if CONFIG_AOM_HIGHBITDEPTH
+ inv_txfm_param.bd = xd->bd;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ highbd_inv_txfm_add(dqcoeff, dst, dst_stride, &inv_txfm_param);
+ } else {
+ inv_txfm_add(dqcoeff, dst, dst_stride, &inv_txfm_param);
+ }
+#else
+ inv_txfm_add(dqcoeff, dst, dst_stride, &inv_txfm_param);
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+ *(args->skip) = 0;
}
#else // #if !CONFIG_PVQ
// transform block size in pixels
- tx_blk_size = tx_size_1d[tx_size];
+ tx_blk_size = tx_size_wide[tx_size];
// copy uint8 orig and predicted block to int16 buffer
// in order to use existing VP10 transform functions
@@ -1183,7 +1285,10 @@
}
fwd_txfm_param.rd_transform = 0;
-
+ fwd_txfm_param.tx_type = tx_type;
+ fwd_txfm_param.tx_size = tx_size;
+ fwd_txfm_param.fwd_txfm_opt = FWD_TXFM_OPT_NORMAL;
+ fwd_txfm_param.lossless = xd->lossless[mbmi->segment_id];
fwd_txfm(src_int16, coeff, diff_stride, &fwd_txfm_param);
fwd_txfm(pred, ref_coeff, diff_stride, &fwd_txfm_param);
@@ -1245,10 +1350,21 @@
}
void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x,
- BLOCK_SIZE bsize, int plane) {
+ BLOCK_SIZE bsize, int plane,
+ int enable_optimize_b) {
const MACROBLOCKD *const xd = &x->e_mbd;
- struct encode_b_args arg = { cm, x, NULL, &xd->mi[0]->mbmi.skip };
+ ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE];
+ ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE];
+ struct encode_b_args arg = {
+ cm, x, NULL, &xd->mi[0]->mbmi.skip, ta, tl, enable_optimize_b
+ };
+ if (enable_optimize_b) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const TX_SIZE tx_size =
+ plane ? get_uv_tx_size(&xd->mi[0]->mbmi, pd) : xd->mi[0]->mbmi.tx_size;
+ av1_get_entropy_contexts(bsize, tx_size, pd, ta, tl);
+ }
av1_foreach_transformed_block_in_plane(xd, bsize, plane,
av1_encode_block_intra, &arg);
}
@@ -1259,7 +1375,7 @@
uint16_t *eob, const int16_t *quant, int plane,
int tx_size, TX_TYPE tx_type, int *rate, int speed,
PVQ_INFO *pvq_info) {
- const int tx_blk_size = tx_size_1d[tx_size];
+ const int tx_blk_size = tx_size_wide[tx_size];
int skip;
// TODO(yushin): Enable this later, when pvq_qm_q4 is available in AOM.
// int pvq_dc_quant = OD_MAXI(1,
@@ -1370,7 +1486,7 @@
int skip_dir,
int bs) { // block size in log_2 -2
int i;
- const int tx_blk_size = tx_size_1d[bs];
+ const int tx_blk_size = tx_size_wide[bs];
for (i = 0; i < nb_bands; i++) {
pvq_info->qg[i] = qg[i];
diff --git a/av1/encoder/encodemb.h b/av1/encoder/encodemb.h
index c5f2ac1..2f2b93b 100644
--- a/av1/encoder/encodemb.h
+++ b/av1/encoder/encodemb.h
@@ -20,20 +20,54 @@
extern "C" {
#endif
+struct optimize_ctx {
+ ENTROPY_CONTEXT ta[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
+ ENTROPY_CONTEXT tl[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
+};
+
struct encode_b_args {
AV1_COMMON *cm;
MACROBLOCK *x;
struct optimize_ctx *ctx;
int8_t *skip;
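+  // Per-plane above/left entropy contexts and a coefficient-optimization
+  // flag; callers in encodemb.c fill these in before iterating over blocks.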
+ ENTROPY_CONTEXT *ta;
+ ENTROPY_CONTEXT *tl;
+ int8_t enable_optimize_b;
};
+
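+// Selects which quantizer av1_xform_quant() applies after the forward
+// transform; AV1_XFORM_QUANT_SKIP_QUANT runs the transform only.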
+typedef enum AV1_XFORM_QUANT {
+ AV1_XFORM_QUANT_FP = 0,
+ AV1_XFORM_QUANT_B = 1,
+ AV1_XFORM_QUANT_DC = 2,
+ AV1_XFORM_QUANT_SKIP_QUANT = 3,
+ AV1_XFORM_QUANT_LAST = 4
+} AV1_XFORM_QUANT;
+
void av1_encode_sb(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize);
+#if CONFIG_SUPERTX
+void av1_encode_sb_supertx(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize);
+#endif // CONFIG_SUPERTX
void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize);
-void av1_xform_quant_fp(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
- int block, int blk_row, int blk_col,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
-void av1_xform_quant(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
- int block, int blk_row, int blk_col,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
+ int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, AV1_XFORM_QUANT xform_quant_idx);
+#if CONFIG_NEW_QUANT
+void av1_xform_quant_nuq(const AV1_COMMON *cm, MACROBLOCK *x, int plane,
+ int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int ctx);
+void av1_xform_quant_dc_nuq(MACROBLOCK *x, int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, int ctx);
+void av1_xform_quant_fp_nuq(const AV1_COMMON *cm, MACROBLOCK *x, int plane,
+ int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int ctx);
+void av1_xform_quant_dc_fp_nuq(MACROBLOCK *x, int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, int ctx);
+#endif
+
+int av1_optimize_b(const AV1_COMMON *cm, MACROBLOCK *mb, int plane, int block,
+ TX_SIZE tx_size, int ctx);
void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
@@ -41,7 +75,8 @@
BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
void av1_encode_intra_block_plane(AV1_COMMON *cm, MACROBLOCK *x,
- BLOCK_SIZE bsize, int plane);
+ BLOCK_SIZE bsize, int plane,
+ int enable_optimize_b);
#if CONFIG_PVQ
int av1_pvq_encode_helper(daala_enc_ctx *daala_enc, tran_low_t *const coeff,
diff --git a/av1/encoder/encodemv.c b/av1/encoder/encodemv.c
index 3e5184a..8a6ad18 100644
--- a/av1/encoder/encodemv.c
+++ b/av1/encoder/encodemv.c
@@ -171,7 +171,6 @@
nmv_context_counts *const nmv_counts) {
int i;
#if CONFIG_REF_MV
- int j;
int nmv_ctx = 0;
for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) {
nmv_context *const mvc = &cm->fc->nmvc[nmv_ctx];
@@ -181,6 +180,7 @@
w);
for (i = 0; i < 2; ++i) {
+ int j;
nmv_component *comp = &mvc->comps[i];
nmv_component_counts *comp_counts = &counts->comps[i];
@@ -194,6 +194,7 @@
}
for (i = 0; i < 2; ++i) {
+ int j;
for (j = 0; j < CLASS0_SIZE; ++j)
write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
@@ -215,11 +216,11 @@
nmv_context *const mvc = &cm->fc->nmvc;
nmv_context_counts *const counts = nmv_counts;
- int j;
#if !CONFIG_EC_ADAPT
write_mv_update(av1_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
for (i = 0; i < 2; ++i) {
+ int j;
nmv_component *comp = &mvc->comps[i];
nmv_component_counts *comp_counts = &counts->comps[i];
@@ -233,6 +234,7 @@
}
for (i = 0; i < 2; ++i) {
+ int j;
for (j = 0; j < CLASS0_SIZE; ++j) {
write_mv_update(av1_mv_fp_tree, mvc->comps[i].class0_fp[j],
counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
@@ -253,10 +255,15 @@
}
void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
+#if CONFIG_REF_MV
+ int is_compound,
+#endif
nmv_context *mvctx, int usehp) {
const MV diff = { mv->row - ref->row, mv->col - ref->col };
const MV_JOINT_TYPE j = av1_get_mv_joint(&diff);
-
+#if CONFIG_REF_MV
+ (void)is_compound;
+#endif
#if CONFIG_EC_MULTISYMBOL
aom_write_symbol(w, j, mvctx->joint_cdf, MV_JOINTS);
#else
@@ -283,26 +290,144 @@
build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp);
}
+#if CONFIG_EXT_INTER
static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
- const int_mv mvs[2], nmv_context_counts *nmv_counts) {
+ const int_mv mvs[2],
+#if CONFIG_REF_MV
+ const int_mv pred_mvs[2],
+#endif
+ nmv_context_counts *nmv_counts) {
int i;
+ PREDICTION_MODE mode = mbmi->mode;
+ int mv_idx = (mode == NEWFROMNEARMV);
+#if !CONFIG_REF_MV
+ nmv_context_counts *counts = nmv_counts;
+#endif
+
+ if (mode == NEWMV || mode == NEWFROMNEARMV || mode == NEW_NEWMV) {
+ for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
+ const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][mv_idx].as_mv;
+ const MV diff = { mvs[i].as_mv.row - ref->row,
+ mvs[i].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+ (void)pred_mvs;
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ }
+ } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
+ const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv;
+ const MV diff = { mvs[1].as_mv.row - ref->row,
+ mvs[1].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+ const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv;
+ const MV diff = { mvs[0].as_mv.row - ref->row,
+ mvs[0].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ }
+}
+
+static void inc_mvs_sub8x8(const MODE_INFO *mi, int block, const int_mv mvs[2],
+#if CONFIG_REF_MV
+ const MB_MODE_INFO_EXT *mbmi_ext,
+#endif
+ nmv_context_counts *nmv_counts) {
+ int i;
+ PREDICTION_MODE mode = mi->bmi[block].as_mode;
+#if CONFIG_REF_MV
+ const MB_MODE_INFO *mbmi = &mi->mbmi;
+#else
+ nmv_context_counts *counts = nmv_counts;
+#endif
+
+ if (mode == NEWMV || mode == NEWFROMNEARMV || mode == NEW_NEWMV) {
+ for (i = 0; i < 1 + has_second_ref(&mi->mbmi); ++i) {
+ const MV *ref = &mi->bmi[block].ref_mv[i].as_mv;
+ const MV diff = { mvs[i].as_mv.row - ref->row,
+ mvs[i].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ }
+ } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
+ const MV *ref = &mi->bmi[block].ref_mv[1].as_mv;
+ const MV diff = { mvs[1].as_mv.row - ref->row,
+ mvs[1].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 1, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+ const MV *ref = &mi->bmi[block].ref_mv[0].as_mv;
+ const MV diff = { mvs[0].as_mv.row - ref->row,
+ mvs[0].as_mv.col - ref->col };
+#if CONFIG_REF_MV
+ int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
+ int nmv_ctx =
+ av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
+ mbmi_ext->ref_mv_stack[rf_type], 0, mbmi->ref_mv_idx);
+ nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+ av1_inc_mv(&diff, counts, 1);
+ }
+}
+#else
+static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
+ const int_mv mvs[2],
+#if CONFIG_REF_MV
+ const int_mv pred_mvs[2],
+#endif
+ nmv_context_counts *nmv_counts) {
+ int i;
+#if !CONFIG_REF_MV
+ nmv_context_counts *counts = nmv_counts;
+#endif
for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
- const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
- const MV diff = { mvs[i].as_mv.row - ref->row,
- mvs[i].as_mv.col - ref->col };
#if CONFIG_REF_MV
int8_t rf_type = av1_ref_frame_type(mbmi->ref_frame);
int nmv_ctx =
av1_nmv_ctx(mbmi_ext->ref_mv_count[rf_type],
mbmi_ext->ref_mv_stack[rf_type], i, mbmi->ref_mv_idx);
nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+ const MV *ref = &pred_mvs[i].as_mv;
#else
- nmv_context_counts *counts = nmv_counts;
+ const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
#endif
+ const MV diff = { mvs[i].as_mv.row - ref->row,
+ mvs[i].as_mv.col - ref->col };
av1_inc_mv(&diff, counts, 1);
}
}
+#endif // CONFIG_EXT_INTER
void av1_update_mv_count(ThreadData *td) {
const MACROBLOCKD *xd = &td->mb.e_mbd;
@@ -318,20 +443,35 @@
for (idy = 0; idy < 2; idy += num_4x4_h) {
for (idx = 0; idx < 2; idx += num_4x4_w) {
const int i = idy * 2 + idx;
+
+#if CONFIG_EXT_INTER
+ if (have_newmv_in_inter_mode(mi->bmi[i].as_mode))
+ inc_mvs_sub8x8(mi, i, mi->bmi[i].as_mv,
+#if CONFIG_REF_MV
+ mbmi_ext, td->counts->mv);
+#else
+ &td->counts->mv);
+#endif
+#else
if (mi->bmi[i].as_mode == NEWMV)
inc_mvs(mbmi, mbmi_ext, mi->bmi[i].as_mv,
#if CONFIG_REF_MV
- td->counts->mv);
+ mi->bmi[i].pred_mv, td->counts->mv);
#else
&td->counts->mv);
#endif
+#endif // CONFIG_EXT_INTER
}
}
} else {
+#if CONFIG_EXT_INTER
+ if (have_newmv_in_inter_mode(mbmi->mode))
+#else
if (mbmi->mode == NEWMV)
+#endif // CONFIG_EXT_INTER
inc_mvs(mbmi, mbmi_ext, mbmi->mv,
#if CONFIG_REF_MV
- td->counts->mv);
+ mbmi->pred_mv, td->counts->mv);
#else
&td->counts->mv);
#endif
diff --git a/av1/encoder/encodemv.h b/av1/encoder/encodemv.h
index 5fea652..17baa2d 100644
--- a/av1/encoder/encodemv.h
+++ b/av1/encoder/encodemv.h
@@ -24,6 +24,9 @@
nmv_context_counts *const counts);
void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
+#if CONFIG_REF_MV
+ int is_compound,
+#endif
nmv_context *mvctx, int usehp);
void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 13f54c9..2bd2001 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -34,6 +34,9 @@
#include "av1/encoder/aq_cyclicrefresh.h"
#include "av1/encoder/aq_variance.h"
#include "av1/encoder/bitstream.h"
+#if CONFIG_ANS
+#include "aom_dsp/buf_ans.h"
+#endif
#include "av1/encoder/context_tree.h"
#include "av1/encoder/encodeframe.h"
#include "av1/encoder/encodemv.h"
@@ -42,6 +45,9 @@
#include "av1/encoder/firstpass.h"
#include "av1/encoder/mbgraph.h"
#include "av1/encoder/picklpf.h"
+#if CONFIG_LOOP_RESTORATION
+#include "av1/encoder/pickrst.h"
+#endif // CONFIG_LOOP_RESTORATION
#include "av1/encoder/ratectrl.h"
#include "av1/encoder/rd.h"
#include "av1/encoder/resize.h"
@@ -49,18 +55,15 @@
#include "av1/encoder/speed_features.h"
#include "av1/encoder/temporal_filter.h"
+#include "./av1_rtcd.h"
#include "./aom_dsp_rtcd.h"
#include "./aom_scale_rtcd.h"
-#include "./av1_rtcd.h"
#include "aom_dsp/psnr.h"
#if CONFIG_INTERNAL_STATS
#include "aom_dsp/ssim.h"
#endif
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
-#if CONFIG_ANS
-#include "aom_dsp/buf_ans.h"
-#endif // CONFIG_ANS
#include "aom_ports/aom_timer.h"
#include "aom_ports/mem.h"
#include "aom_ports/system_state.h"
@@ -81,18 +84,16 @@
// now so that HIGH_PRECISION is always
// chosen.
// #define OUTPUT_YUV_REC
-
#ifdef OUTPUT_YUV_DENOISED
FILE *yuv_denoised_file = NULL;
-#endif // OUTPUT_YUV_DENOISED
-
+#endif
#ifdef OUTPUT_YUV_SKINMAP
FILE *yuv_skinmap_file = NULL;
-#endif // OUTPUT_YUV_SKINMAP
-
+#endif
#ifdef OUTPUT_YUV_REC
FILE *yuv_rec_file;
-#endif // OUTPUT_YUV_REC
+#define FILE_NAME_LEN 100
+#endif
#if 0
FILE *framepsnr;
@@ -100,6 +101,10 @@
FILE *keyfile;
#endif
+#if CONFIG_INTERNAL_STATS
+typedef enum { Y, U, V, ALL } STAT_TYPE;
+#endif // CONFIG_INTERNAL_STATS
+
static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) {
switch (mode) {
case NORMAL:
@@ -246,14 +251,37 @@
#else
if (cpi->common.allow_high_precision_mv) {
mb->mvcost = mb->nmvcost_hp;
- mb->mvsadcost = mb->nmvsadcost_hp;
+ mb->mvsadcost = mb->nmvcost_hp;
} else {
mb->mvcost = mb->nmvcost;
- mb->mvsadcost = mb->nmvsadcost;
+ mb->mvsadcost = mb->nmvcost;
}
#endif
}
+static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) {
+#if CONFIG_EXT_PARTITION
+ if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_64X64)
+ return BLOCK_64X64;
+
+ if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_128X128)
+ return BLOCK_128X128;
+
+ assert(cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC);
+
+ assert(IMPLIES(cpi->common.tile_cols > 1,
+ cpi->common.tile_width % MAX_MIB_SIZE == 0));
+ assert(IMPLIES(cpi->common.tile_rows > 1,
+ cpi->common.tile_height % MAX_MIB_SIZE == 0));
+
+ // TODO(any): Possibly could improve this with a heuristic.
+ return BLOCK_128X128;
+#else
+ (void)cpi;
+ return BLOCK_64X64;
+#endif // CONFIG_EXT_PARTITION
+}
+
static void setup_frame(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
// Set up entropy context depending on frame type. The decoder mandates
@@ -264,8 +292,15 @@
if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
av1_setup_past_independence(cm);
} else {
- if (cpi->refresh_alt_ref_frame)
+#if CONFIG_EXT_REFS
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
+ cm->frame_context_idx = EXT_ARF_FRAME;
+ else if (cpi->refresh_alt_ref_frame)
cm->frame_context_idx = ARF_FRAME;
+#else
+ if (cpi->refresh_alt_ref_frame) cm->frame_context_idx = ARF_FRAME;
+#endif // CONFIG_EXT_REFS
else if (cpi->rc.is_src_frame_alt_ref)
cm->frame_context_idx = OVERLAY_FRAME;
else if (cpi->refresh_golden_frame)
@@ -273,7 +308,7 @@
#if CONFIG_EXT_REFS
else if (cpi->refresh_bwd_ref_frame)
cm->frame_context_idx = BRF_FRAME;
-#endif
+#endif // CONFIG_EXT_REFS
else
cm->frame_context_idx = REGULAR_FRAME;
}
@@ -286,6 +321,10 @@
*cm->fc = cm->frame_contexts[cm->frame_context_idx];
av1_zero(cpi->interp_filter_selected[0]);
}
+
+ cpi->vaq_refresh = 0;
+
+ set_sb_size(cm, select_sb_size(cpi));
}
static void av1_enc_setup_mi(AV1_COMMON *cm) {
@@ -362,6 +401,9 @@
av1_rc_init_minq_luts();
av1_entropy_mv_init();
av1_encode_token_init();
+#if CONFIG_EXT_INTER
+ av1_init_wedge_masks();
+#endif
init_done = 1;
}
}
@@ -401,13 +443,17 @@
cpi->active_map.map = NULL;
// Free up-sampled reference buffers.
- for (i = 0; i < MAX_UPSAMPLED_BUFS; i++)
+ for (i = 0; i < (REF_FRAMES + 1); i++)
aom_free_frame_buffer(&cpi->upsampled_ref_bufs[i].buf);
av1_free_ref_frame_buffers(cm->buffer_pool);
av1_free_context_buffers(cm);
aom_free_frame_buffer(&cpi->last_frame_uf);
+#if CONFIG_LOOP_RESTORATION
+ aom_free_frame_buffer(&cpi->last_frame_db);
+ av1_free_restoration_buffers(cm);
+#endif // CONFIG_LOOP_RESTORATION
aom_free_frame_buffer(&cpi->scaled_source);
aom_free_frame_buffer(&cpi->scaled_last_source);
aom_free_frame_buffer(&cpi->alt_ref_buffer);
@@ -417,6 +463,7 @@
cpi->tile_tok[0][0] = 0;
av1_free_pc_tree(&cpi->td);
+ av1_free_var_tree(&cpi->td);
#if CONFIG_PALETTE
if (cpi->common.allow_screen_content_tools)
@@ -455,6 +502,7 @@
av1_copy(cc->nmvcosts, cpi->nmvcosts);
av1_copy(cc->nmvcosts_hp, cpi->nmvcosts_hp);
+
av1_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
av1_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
@@ -482,6 +530,7 @@
av1_copy(cpi->nmvcosts, cc->nmvcosts);
av1_copy(cpi->nmvcosts_hp, cc->nmvcosts_hp);
+
av1_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
av1_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
@@ -667,6 +716,18 @@
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate last frame buffer");
+#if CONFIG_LOOP_RESTORATION
+ if (aom_realloc_frame_buffer(&cpi->last_frame_db, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_AOM_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+ NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate last frame deblocked buffer");
+#endif // CONFIG_LOOP_RESTORATION
+
if (aom_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
#if CONFIG_AOM_HIGHBITDEPTH
@@ -724,15 +785,61 @@
av1_rc_update_framerate(cpi);
}
-static void set_tile_limits(AV1_COMP *cpi) {
+static void set_tile_info(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
+#if CONFIG_EXT_TILE
+#if CONFIG_EXT_PARTITION
+ if (cpi->oxcf.superblock_size != AOM_SUPERBLOCK_SIZE_64X64) {
+ cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 32);
+ cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 32);
+ cm->tile_width <<= MAX_MIB_SIZE_LOG2;
+ cm->tile_height <<= MAX_MIB_SIZE_LOG2;
+ } else {
+ cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 64);
+ cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64);
+ cm->tile_width <<= MAX_MIB_SIZE_LOG2 - 1;
+ cm->tile_height <<= MAX_MIB_SIZE_LOG2 - 1;
+ }
+#else
+ cm->tile_width = clamp(cpi->oxcf.tile_columns, 1, 64);
+ cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64);
+ cm->tile_width <<= MAX_MIB_SIZE_LOG2;
+ cm->tile_height <<= MAX_MIB_SIZE_LOG2;
+#endif // CONFIG_EXT_PARTITION
+
+ cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols);
+ cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows);
+
+ assert(cm->tile_width >> MAX_MIB_SIZE <= 32);
+ assert(cm->tile_height >> MAX_MIB_SIZE <= 32);
+
+ // Get the number of tiles
+ cm->tile_cols = 1;
+ while (cm->tile_cols * cm->tile_width < cm->mi_cols) ++cm->tile_cols;
+
+ cm->tile_rows = 1;
+ while (cm->tile_rows * cm->tile_height < cm->mi_rows) ++cm->tile_rows;
+#else
int min_log2_tile_cols, max_log2_tile_cols;
av1_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
cm->log2_tile_cols =
clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols);
cm->log2_tile_rows = cpi->oxcf.tile_rows;
+
+ cm->tile_cols = 1 << cm->log2_tile_cols;
+ cm->tile_rows = 1 << cm->log2_tile_rows;
+
+ cm->tile_width = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+ cm->tile_width >>= cm->log2_tile_cols;
+ cm->tile_height = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
+ cm->tile_height >>= cm->log2_tile_rows;
+
+  // Round to integer multiples of the max superblock size.
+ cm->tile_width = ALIGN_POWER_OF_TWO(cm->tile_width, MAX_MIB_SIZE_LOG2);
+ cm->tile_height = ALIGN_POWER_OF_TWO(cm->tile_height, MAX_MIB_SIZE_LOG2);
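+  // Illustrative example (assuming 8x8 mi units and MAX_MIB_SIZE_LOG2 == 4,
+  // i.e. 128x128 superblocks): a 1920-pixel-wide frame has mi_cols == 240;
+  // with log2_tile_cols == 1 the aligned width 240 is halved to 120 and then
+  // rounded back up to 128 mi units, so the first tile column spans 128 mi
+  // (1024 pixels) and the last one covers the remaining 112 mi.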
+#endif // CONFIG_EXT_TILE
}
static void update_frame_size(AV1_COMP *cpi) {
@@ -749,7 +856,7 @@
memset(cpi->mbmi_ext_base, 0,
cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
- set_tile_limits(cpi);
+ set_tile_info(cpi);
}
static void init_buffer_indices(AV1_COMP *cpi) {
@@ -760,6 +867,8 @@
cpi->gld_fb_idx = LAST_REF_FRAMES;
cpi->bwd_fb_idx = LAST_REF_FRAMES + 1;
cpi->alt_fb_idx = LAST_REF_FRAMES + 2;
+ for (fb_idx = 0; fb_idx < MAX_EXT_ARFS + 1; ++fb_idx)
+ cpi->arf_map[fb_idx] = LAST_REF_FRAMES + 2 + fb_idx;
#else
cpi->lst_fb_idx = 0;
cpi->gld_fb_idx = 1;
@@ -920,7 +1029,19 @@
for (i = 0; i < 4; i++) sad_array[i] >>= 4; \
}
-/* clang-format off */
+#if CONFIG_EXT_PARTITION
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg)
+MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad128x128x3)
+MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad128x128x8)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d)
+#endif // CONFIG_EXT_PARTITION
MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d)
@@ -976,7 +1097,51 @@
MAKE_BFP_SAD3_WRAPPER(aom_highbd_sad4x4x3)
MAKE_BFP_SAD8_WRAPPER(aom_highbd_sad4x4x8)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
-/* clang-format on */
+
+#if CONFIG_EXT_INTER
+#define HIGHBD_MBFP(BT, MSDF, MVF, MSVF) \
+ cpi->fn_ptr[BT].msdf = MSDF; \
+ cpi->fn_ptr[BT].mvf = MVF; \
+ cpi->fn_ptr[BT].msvf = MSVF;
+
+#define MAKE_MBFP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *m, int m_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *m, int m_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *m, int m_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, m, m_stride) >> \
+ 4; \
+ }
+
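+// The >> 2 and >> 4 shifts in the wrappers above scale 10- and 12-bit SAD
+// values back down to the 8-bit range (per-pixel differences grow roughly by
+// a factor of 4 and 16, respectively), so thresholds tuned for 8-bit input
+// remain comparable across bit depths.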
+#if CONFIG_EXT_PARTITION
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad128x128)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad128x64)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x128)
+#endif // CONFIG_EXT_PARTITION
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x64)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad64x32)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x64)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x32)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad32x16)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x32)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x16)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad16x8)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x16)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x8)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad8x4)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad4x8)
+MAKE_MBFP_SAD_WRAPPER(aom_highbd_masked_sad4x4)
+#endif // CONFIG_EXT_INTER
#if CONFIG_MOTION_VAR
#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
@@ -1001,6 +1166,11 @@
return fnname(ref, ref_stride, wsrc, msk) >> 4; \
}
+#if CONFIG_EXT_PARTITION
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128)
+#endif // CONFIG_EXT_PARTITION
MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64)
MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32)
MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64)
@@ -1102,7 +1272,92 @@
aom_highbd_8_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits8,
aom_highbd_sad4x4x8_bits8, aom_highbd_sad4x4x4d_bits8)
+#if CONFIG_EXT_PARTITION
+ HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits8,
+ aom_highbd_sad128x128_avg_bits8,
+ aom_highbd_8_variance128x128,
+ aom_highbd_8_sub_pixel_variance128x128,
+ aom_highbd_8_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x3_bits8, aom_highbd_sad128x128x8_bits8,
+ aom_highbd_sad128x128x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits8,
+ aom_highbd_sad128x64_avg_bits8, aom_highbd_8_variance128x64,
+ aom_highbd_8_sub_pixel_variance128x64,
+ aom_highbd_8_sub_pixel_avg_variance128x64, NULL, NULL,
+ aom_highbd_sad128x64x4d_bits8)
+
+ HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits8,
+ aom_highbd_sad64x128_avg_bits8, aom_highbd_8_variance64x128,
+ aom_highbd_8_sub_pixel_variance64x128,
+ aom_highbd_8_sub_pixel_avg_variance64x128, NULL, NULL,
+ aom_highbd_sad64x128x4d_bits8)
+#endif // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_INTER
+#if CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8,
+ aom_highbd_masked_variance128x128,
+ aom_highbd_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits8,
+ aom_highbd_masked_variance128x64,
+ aom_highbd_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits8,
+ aom_highbd_masked_variance64x128,
+ aom_highbd_masked_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits8,
+ aom_highbd_masked_variance64x64,
+ aom_highbd_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits8,
+ aom_highbd_masked_variance64x32,
+ aom_highbd_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits8,
+ aom_highbd_masked_variance32x64,
+ aom_highbd_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits8,
+ aom_highbd_masked_variance32x32,
+ aom_highbd_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits8,
+ aom_highbd_masked_variance32x16,
+ aom_highbd_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits8,
+ aom_highbd_masked_variance16x32,
+ aom_highbd_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits8,
+ aom_highbd_masked_variance16x16,
+ aom_highbd_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits8,
+ aom_highbd_masked_variance8x16,
+ aom_highbd_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits8,
+ aom_highbd_masked_variance16x8,
+ aom_highbd_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits8,
+ aom_highbd_masked_variance8x8,
+ aom_highbd_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits8,
+ aom_highbd_masked_variance4x8,
+ aom_highbd_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits8,
+ aom_highbd_masked_variance8x4,
+ aom_highbd_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8,
+ aom_highbd_masked_variance4x4,
+ aom_highbd_masked_sub_pixel_variance4x4)
+#endif // CONFIG_EXT_INTER
#if CONFIG_MOTION_VAR
+#if CONFIG_EXT_PARTITION
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits8,
+ aom_highbd_obmc_variance128x128,
+ aom_highbd_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits8,
+ aom_highbd_obmc_variance128x64,
+ aom_highbd_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits8,
+ aom_highbd_obmc_variance64x128,
+ aom_highbd_obmc_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits8,
aom_highbd_obmc_variance64x64,
aom_highbd_obmc_sub_pixel_variance64x64)
@@ -1229,7 +1484,94 @@
aom_highbd_10_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits10,
aom_highbd_sad4x4x8_bits10, aom_highbd_sad4x4x4d_bits10)
+#if CONFIG_EXT_PARTITION
+ HIGHBD_BFP(
+ BLOCK_128X128, aom_highbd_sad128x128_bits10,
+ aom_highbd_sad128x128_avg_bits10, aom_highbd_10_variance128x128,
+ aom_highbd_10_sub_pixel_variance128x128,
+ aom_highbd_10_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x3_bits10, aom_highbd_sad128x128x8_bits10,
+ aom_highbd_sad128x128x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits10,
+ aom_highbd_sad128x64_avg_bits10,
+ aom_highbd_10_variance128x64,
+ aom_highbd_10_sub_pixel_variance128x64,
+ aom_highbd_10_sub_pixel_avg_variance128x64, NULL, NULL,
+ aom_highbd_sad128x64x4d_bits10)
+
+ HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits10,
+ aom_highbd_sad64x128_avg_bits10,
+ aom_highbd_10_variance64x128,
+ aom_highbd_10_sub_pixel_variance64x128,
+ aom_highbd_10_sub_pixel_avg_variance64x128, NULL, NULL,
+ aom_highbd_sad64x128x4d_bits10)
+#endif // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_INTER
+#if CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10,
+ aom_highbd_10_masked_variance128x128,
+ aom_highbd_10_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits10,
+ aom_highbd_10_masked_variance128x64,
+ aom_highbd_10_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits10,
+ aom_highbd_10_masked_variance64x128,
+ aom_highbd_10_masked_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits10,
+ aom_highbd_10_masked_variance64x64,
+ aom_highbd_10_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits10,
+ aom_highbd_10_masked_variance64x32,
+ aom_highbd_10_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits10,
+ aom_highbd_10_masked_variance32x64,
+ aom_highbd_10_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits10,
+ aom_highbd_10_masked_variance32x32,
+ aom_highbd_10_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits10,
+ aom_highbd_10_masked_variance32x16,
+ aom_highbd_10_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits10,
+ aom_highbd_10_masked_variance16x32,
+ aom_highbd_10_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits10,
+ aom_highbd_10_masked_variance16x16,
+ aom_highbd_10_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits10,
+ aom_highbd_10_masked_variance8x16,
+ aom_highbd_10_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits10,
+ aom_highbd_10_masked_variance16x8,
+ aom_highbd_10_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits10,
+ aom_highbd_10_masked_variance8x8,
+ aom_highbd_10_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits10,
+ aom_highbd_10_masked_variance4x8,
+ aom_highbd_10_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits10,
+ aom_highbd_10_masked_variance8x4,
+ aom_highbd_10_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10,
+ aom_highbd_10_masked_variance4x4,
+ aom_highbd_10_masked_sub_pixel_variance4x4)
+#endif // CONFIG_EXT_INTER
#if CONFIG_MOTION_VAR
+#if CONFIG_EXT_PARTITION
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits10,
+ aom_highbd_10_obmc_variance128x128,
+ aom_highbd_10_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits10,
+ aom_highbd_10_obmc_variance128x64,
+ aom_highbd_10_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits10,
+ aom_highbd_10_obmc_variance64x128,
+ aom_highbd_10_obmc_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits10,
aom_highbd_10_obmc_variance64x64,
aom_highbd_10_obmc_sub_pixel_variance64x64)
@@ -1356,7 +1698,95 @@
aom_highbd_12_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x3_bits12,
aom_highbd_sad4x4x8_bits12, aom_highbd_sad4x4x4d_bits12)
+#if CONFIG_EXT_PARTITION
+ HIGHBD_BFP(
+ BLOCK_128X128, aom_highbd_sad128x128_bits12,
+ aom_highbd_sad128x128_avg_bits12, aom_highbd_12_variance128x128,
+ aom_highbd_12_sub_pixel_variance128x128,
+ aom_highbd_12_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x3_bits12, aom_highbd_sad128x128x8_bits12,
+ aom_highbd_sad128x128x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits12,
+ aom_highbd_sad128x64_avg_bits12,
+ aom_highbd_12_variance128x64,
+ aom_highbd_12_sub_pixel_variance128x64,
+ aom_highbd_12_sub_pixel_avg_variance128x64, NULL, NULL,
+ aom_highbd_sad128x64x4d_bits12)
+
+ HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits12,
+ aom_highbd_sad64x128_avg_bits12,
+ aom_highbd_12_variance64x128,
+ aom_highbd_12_sub_pixel_variance64x128,
+ aom_highbd_12_sub_pixel_avg_variance64x128, NULL, NULL,
+ aom_highbd_sad64x128x4d_bits12)
+#endif // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_INTER
+#if CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12,
+ aom_highbd_12_masked_variance128x128,
+ aom_highbd_12_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits12,
+ aom_highbd_12_masked_variance128x64,
+ aom_highbd_12_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits12,
+ aom_highbd_12_masked_variance64x128,
+ aom_highbd_12_masked_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits12,
+ aom_highbd_12_masked_variance64x64,
+ aom_highbd_12_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits12,
+ aom_highbd_12_masked_variance64x32,
+ aom_highbd_12_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits12,
+ aom_highbd_12_masked_variance32x64,
+ aom_highbd_12_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits12,
+ aom_highbd_12_masked_variance32x32,
+ aom_highbd_12_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits12,
+ aom_highbd_12_masked_variance32x16,
+ aom_highbd_12_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits12,
+ aom_highbd_12_masked_variance16x32,
+ aom_highbd_12_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits12,
+ aom_highbd_12_masked_variance16x16,
+ aom_highbd_12_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits12,
+ aom_highbd_12_masked_variance8x16,
+ aom_highbd_12_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits12,
+ aom_highbd_12_masked_variance16x8,
+ aom_highbd_12_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits12,
+ aom_highbd_12_masked_variance8x8,
+ aom_highbd_12_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits12,
+ aom_highbd_12_masked_variance4x8,
+ aom_highbd_12_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits12,
+ aom_highbd_12_masked_variance8x4,
+ aom_highbd_12_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12,
+ aom_highbd_12_masked_variance4x4,
+ aom_highbd_12_masked_sub_pixel_variance4x4)
+#endif // CONFIG_EXT_INTER
+
#if CONFIG_MOTION_VAR
+#if CONFIG_EXT_PARTITION
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits12,
+ aom_highbd_12_obmc_variance128x128,
+ aom_highbd_12_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits12,
+ aom_highbd_12_obmc_variance128x64,
+ aom_highbd_12_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits12,
+ aom_highbd_12_obmc_variance64x128,
+ aom_highbd_12_obmc_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits12,
aom_highbd_12_obmc_variance64x64,
aom_highbd_12_obmc_sub_pixel_variance64x64)
@@ -1445,6 +1875,9 @@
#if CONFIG_AOM_HIGHBITDEPTH
cpi->td.mb.e_mbd.bd = (int)cm->bit_depth;
#endif // CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_GLOBAL_MOTION
+ cpi->td.mb.e_mbd.global_motion = cm->global_motion;
+#endif // CONFIG_GLOBAL_MOTION
if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) {
rc->baseline_gf_interval = FIXED_GF_INTERVAL;
@@ -1457,18 +1890,18 @@
#if CONFIG_EXT_REFS
cpi->refresh_bwd_ref_frame = 0;
#endif // CONFIG_EXT_REFS
- cm->refresh_frame_context = oxcf->error_resilient_mode
- ? REFRESH_FRAME_CONTEXT_OFF
- : oxcf->frame_parallel_decoding_mode
- ? REFRESH_FRAME_CONTEXT_FORWARD
- : REFRESH_FRAME_CONTEXT_BACKWARD;
+
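+  // Roughly speaking, FORWARD refresh updates the frame context only from
+  // explicitly signaled probability updates, while BACKWARD refresh also
+  // adapts it from the decoded frame's symbol counts. Error-resilient and
+  // frame-parallel streams therefore use FORWARD so that a frame can be
+  // parsed without the previous frame's statistics.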
+ cm->refresh_frame_context =
+ (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode)
+ ? REFRESH_FRAME_CONTEXT_FORWARD
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
#if CONFIG_PALETTE
cm->allow_screen_content_tools = (cpi->oxcf.content == AOM_CONTENT_SCREEN);
if (cm->allow_screen_content_tools) {
MACROBLOCK *x = &cpi->td.mb;
- if (x->palette_buffer == NULL) {
+ if (x->palette_buffer == 0) {
CHECK_MEM_ERROR(cm, x->palette_buffer,
aom_memalign(16, sizeof(*x->palette_buffer)));
}
@@ -1533,7 +1966,7 @@
cpi->last_frame_distortion = 0;
#endif
- set_tile_limits(cpi);
+ set_tile_info(cpi);
cpi->ext_refresh_frame_flags_pending = 0;
cpi->ext_refresh_frame_context_pending = 0;
@@ -1590,7 +2023,7 @@
static INLINE void init_upsampled_ref_frame_bufs(AV1_COMP *cpi) {
int i;
- for (i = 0; i < MAX_UPSAMPLED_BUFS; ++i) {
+ for (i = 0; i < (REF_FRAMES + 1); ++i) {
cpi->upsampled_ref_bufs[i].ref_count = 0;
cpi->upsampled_ref_idx[i] = INVALID_IDX;
}
@@ -1636,12 +2069,15 @@
cm->current_video_frame = 0;
cpi->partition_search_skippable_frame = 0;
cpi->tile_data = NULL;
+ cpi->last_show_frame_buf_idx = INVALID_IDX;
realloc_segmentation_maps(cpi);
#if CONFIG_REF_MV
- memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs));
- memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp));
+ for (i = 0; i < NMV_CONTEXTS; ++i) {
+ memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs));
+ memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp));
+ }
#endif
memset(cpi->nmvcosts, 0, sizeof(cpi->nmvcosts));
@@ -1684,7 +2120,6 @@
if (cpi->b_calculate_psnr) {
cpi->total_sq_error = 0;
cpi->total_samples = 0;
-
cpi->tot_recode_hits = 0;
cpi->summed_quality = 0;
cpi->summed_weights = 0;
@@ -1699,8 +2134,9 @@
}
if (cpi->b_calculate_consistency) {
- cpi->ssim_vars = aom_malloc(sizeof(*cpi->ssim_vars) * 4 *
- cpi->common.mi_rows * cpi->common.mi_cols);
+ CHECK_MEM_ERROR(cm, cpi->ssim_vars,
+ aom_malloc(sizeof(*cpi->ssim_vars) * 4 *
+ cpi->common.mi_rows * cpi->common.mi_cols));
cpi->worst_consistency = 100.0;
}
#endif
@@ -1721,7 +2157,6 @@
cpi->td.mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
cpi->td.mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
#endif
-
cpi->td.mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
cpi->td.mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
cal_nmvsadcosts(cpi->td.mb.nmvsadcost);
@@ -1732,11 +2167,10 @@
#ifdef OUTPUT_YUV_SKINMAP
yuv_skinmap_file = fopen("skinmap.yuv", "ab");
-#endif // OUTPUT_YUV_SKINMAP
-
+#endif
#ifdef OUTPUT_YUV_REC
- yuv_rec_file = fopen("/tmp/enc_recon.yuv", "wb");
-#endif // OUTPUT_YUV_REC
+ yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
#if 0
framepsnr = fopen("framepsnr.stt", "a");
@@ -1790,6 +2224,20 @@
cpi->fn_ptr[BT].sdx8f = SDX8F; \
cpi->fn_ptr[BT].sdx4df = SDX4DF;
+#if CONFIG_EXT_PARTITION
+ BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
+ aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
+ aom_sad128x128x3, aom_sad128x128x8, aom_sad128x128x4d)
+
+ BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64,
+ aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64, NULL,
+ NULL, aom_sad128x64x4d)
+
+ BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128,
+ aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128, NULL,
+ NULL, aom_sad64x128x4d)
+#endif // CONFIG_EXT_PARTITION
+
BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16,
aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16, NULL, NULL,
aom_sad32x16x4d)
@@ -1848,6 +2296,14 @@
cpi->fn_ptr[BT].ovf = OVF; \
cpi->fn_ptr[BT].osvf = OSVF;
+#if CONFIG_EXT_PARTITION
+ OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128,
+ aom_obmc_sub_pixel_variance128x128)
+ OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64,
+ aom_obmc_sub_pixel_variance128x64)
+ OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128,
+ aom_obmc_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64,
aom_obmc_sub_pixel_variance64x64)
OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32,
@@ -1876,6 +2332,48 @@
aom_obmc_sub_pixel_variance4x4)
#endif // CONFIG_MOTION_VAR
+#if CONFIG_EXT_INTER
+#define MBFP(BT, MSDF, MVF, MSVF) \
+ cpi->fn_ptr[BT].msdf = MSDF; \
+ cpi->fn_ptr[BT].mvf = MVF; \
+ cpi->fn_ptr[BT].msvf = MSVF;
+
+#if CONFIG_EXT_PARTITION
+ MBFP(BLOCK_128X128, aom_masked_sad128x128, aom_masked_variance128x128,
+ aom_masked_sub_pixel_variance128x128)
+ MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_variance128x64,
+ aom_masked_sub_pixel_variance128x64)
+ MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_variance64x128,
+ aom_masked_sub_pixel_variance64x128)
+#endif // CONFIG_EXT_PARTITION
+ MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_variance64x64,
+ aom_masked_sub_pixel_variance64x64)
+ MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_variance64x32,
+ aom_masked_sub_pixel_variance64x32)
+ MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_variance32x64,
+ aom_masked_sub_pixel_variance32x64)
+ MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_variance32x32,
+ aom_masked_sub_pixel_variance32x32)
+ MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_variance32x16,
+ aom_masked_sub_pixel_variance32x16)
+ MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_variance16x32,
+ aom_masked_sub_pixel_variance16x32)
+ MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_variance16x16,
+ aom_masked_sub_pixel_variance16x16)
+ MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_variance16x8,
+ aom_masked_sub_pixel_variance16x8)
+ MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_variance8x16,
+ aom_masked_sub_pixel_variance8x16)
+ MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_variance8x8,
+ aom_masked_sub_pixel_variance8x8)
+ MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_variance4x8,
+ aom_masked_sub_pixel_variance4x8)
+ MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_variance8x4,
+ aom_masked_sub_pixel_variance8x4)
+ MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_variance4x4,
+ aom_masked_sub_pixel_variance4x4)
+#endif // CONFIG_EXT_INTER
+
#if CONFIG_AOM_HIGHBITDEPTH
highbd_set_var_fns(cpi);
#endif
@@ -1891,11 +2389,15 @@
#endif
av1_loop_filter_init(cm);
+#if CONFIG_LOOP_RESTORATION
+ av1_loop_restoration_precal();
+#endif // CONFIG_LOOP_RESTORATION
cm->error.setjmp = 0;
return cpi;
}
+
#define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
#define SNPRINT2(H, T, V) \
@@ -1933,7 +2435,6 @@
(double)cpi->total_samples, peak, (double)cpi->total_sq_error);
const double total_ssim =
100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
-
snprintf(headings, sizeof(headings),
"Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
"AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
@@ -1963,7 +2464,6 @@
SNPRINT2(results, "\t%7.3f", consistency);
SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
}
-
fprintf(f, "%s\t Time\tRcErr\tAbsErr\n", headings);
fprintf(f, "%s\t%8.0f\t%7.2f\t%7.2f\n", results, total_encode_time,
rate_err, fabs(rate_err));
@@ -2001,6 +2501,7 @@
#endif // CONFIG_PALETTE
aom_free(thread_data->td->counts);
av1_free_pc_tree(thread_data->td);
+ av1_free_var_tree(thread_data->td);
aom_free(thread_data->td);
}
}
@@ -2029,11 +2530,10 @@
#ifdef OUTPUT_YUV_SKINMAP
fclose(yuv_skinmap_file);
-#endif // OUTPUT_YUV_SKINMAP
-
+#endif
#ifdef OUTPUT_YUV_REC
fclose(yuv_rec_file);
-#endif // OUTPUT_YUV_REC
+#endif
#if 0
@@ -2070,7 +2570,7 @@
}
int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags) {
- if (ref_frame_flags > 7) return -1;
+ if (ref_frame_flags > ((1 << INTER_REFS_PER_FRAME) - 1)) return -1;
cpi->ref_frame_flags = ref_frame_flags;
return 0;
@@ -2133,9 +2633,71 @@
return 0;
}
+#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP)
+// The denoiser buffer is allocated as a YUV 440 buffer. This function writes
+// it as YUV 420. We simply use the top-left pixels of the UV buffers, since
+// we do not denoise the UV channels at this time. If we ever implement UV
+// channel denoising, this will have to be modified.
+void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
+ uint8_t *src = s->y_buffer;
+ int h = s->y_height;
+
+ do {
+ fwrite(src, s->y_width, 1, f);
+ src += s->y_stride;
+ } while (--h);
+
+ src = s->u_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, f);
+ src += s->uv_stride;
+ } while (--h);
+
+ src = s->v_buffer;
+ h = s->uv_height;
+
+ do {
+ fwrite(src, s->uv_width, 1, f);
+ src += s->uv_stride;
+ } while (--h);
+}
+#endif
+
+#if CONFIG_EXT_REFS
+static void check_show_existing_frame(AV1_COMP *cpi) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ AV1_COMMON *const cm = &cpi->common;
+ const FRAME_UPDATE_TYPE next_frame_update_type =
+ gf_group->update_type[gf_group->index];
+ const int which_arf = gf_group->arf_update_idx[gf_group->index];
+
+ if (cm->show_existing_frame == 1) {
+ cm->show_existing_frame = 0;
+ } else if (cpi->rc.is_last_bipred_frame) {
+ // NOTE(zoeliu): If the current frame is a last bi-predictive frame, it is
+ // needed next to show the BWDREF_FRAME, which is pointed by
+ // the last_fb_idxes[0] after reference frame buffer update
+ cpi->rc.is_last_bipred_frame = 0;
+ cm->show_existing_frame = 1;
+ cpi->existing_fb_idx_to_show = cpi->lst_fb_idxes[0];
+ } else if (cpi->is_arf_filter_off[which_arf] &&
+ (next_frame_update_type == OVERLAY_UPDATE ||
+ next_frame_update_type == INTNL_OVERLAY_UPDATE)) {
+ // Other parameters related to OVERLAY_UPDATE will be taken care of
+ // in av1_rc_get_second_pass_params(cpi)
+ cm->show_existing_frame = 1;
+ cpi->rc.is_src_frame_alt_ref = 1;
+ cpi->existing_fb_idx_to_show = cpi->alt_fb_idx;
+ cpi->is_arf_filter_off[which_arf] = 0;
+ }
+ cpi->rc.is_src_frame_ext_arf = 0;
+}
+#endif // CONFIG_EXT_REFS
+
#ifdef OUTPUT_YUV_REC
-void av1_write_yuv_rec_frame(AV1_COMMON *cm) {
- YV12_BUFFER_CONFIG *s = cm->frame_to_show;
+void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
uint8_t *src = s->y_buffer;
int h = cm->height;
@@ -2192,7 +2754,7 @@
fflush(yuv_rec_file);
}
-#endif
+#endif // OUTPUT_YUV_REC
#if CONFIG_AOM_HIGHBITDEPTH
static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
@@ -2253,7 +2815,10 @@
const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride };
uint8_t *const dsts[3] = { dst->y_buffer, dst->u_buffer, dst->v_buffer };
const int dst_strides[3] = { dst->y_stride, dst->uv_stride, dst->uv_stride };
- InterpFilterParams interp_filter_params = get_interp_filter_params(EIGHTTAP);
+ const InterpFilterParams interp_filter_params =
+ av1_get_interp_filter_params(EIGHTTAP_REGULAR);
+ const int16_t *kernel = interp_filter_params.filter_ptr;
+ const int taps = interp_filter_params.taps;
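+  // kernel points at the filter's flat coefficient table, laid out as one
+  // set of `taps` coefficients per 1/16-pel phase; &kernel[(x_q4 & 0xf) * taps]
+  // below selects the coefficients for the current subpel offset.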
int x, y, i;
for (y = 0; y < dst_h; y += 16) {
@@ -2262,33 +2827,29 @@
const int factor = (i == 0 || i == 3 ? 1 : 2);
const int x_q4 = x * (16 / factor) * src_w / dst_w;
const int y_q4 = y * (16 / factor) * src_h / dst_h;
- const int subpel_x = x_q4 & 0xf;
- const int subpel_y = y_q4 & 0xf;
const int src_stride = src_strides[i];
const int dst_stride = dst_strides[i];
const uint8_t *src_ptr = srcs[i] +
(y / factor) * src_h / dst_h * src_stride +
(x / factor) * src_w / dst_w;
uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
- const int16_t *filter_x =
- get_interp_filter_subpel_kernel(interp_filter_params, subpel_x);
- const int16_t *filter_y =
- get_interp_filter_subpel_kernel(interp_filter_params, subpel_y);
#if CONFIG_AOM_HIGHBITDEPTH
if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
aom_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
- filter_x, 16 * src_w / dst_w, filter_y,
- 16 * src_h / dst_h, 16 / factor, 16 / factor,
- bd);
+ &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w,
+ &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h,
+ 16 / factor, 16 / factor, bd);
} else {
- aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, filter_x,
- 16 * src_w / dst_w, filter_y, 16 * src_h / dst_h,
+ aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
+ &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w,
+ &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h,
16 / factor, 16 / factor);
}
#else
- aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, filter_x,
- 16 * src_w / dst_w, filter_y, 16 * src_h / dst_h,
+ aom_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
+ &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w,
+ &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h,
16 / factor, 16 / factor);
#endif // CONFIG_AOM_HIGHBITDEPTH
}
@@ -2355,7 +2916,7 @@
static INLINE int get_free_upsampled_ref_buf(EncRefCntBuffer *ubufs) {
int i;
- for (i = 0; i < MAX_UPSAMPLED_BUFS; i++) {
+ for (i = 0; i < (REF_FRAMES + 1); i++) {
if (!ubufs[i].ref_count) {
return i;
}
@@ -2398,16 +2959,94 @@
}
}
+#define DUMP_REF_FRAME_IMAGES 0
+
+#if DUMP_REF_FRAME_IMAGES == 1
+static int dump_one_image(AV1_COMMON *cm,
+ const YV12_BUFFER_CONFIG *const ref_buf,
+ char *file_name) {
+ int h;
+ FILE *f_ref = NULL;
+
+ if (ref_buf == NULL) {
+ printf("Frame data buffer is NULL.\n");
+ return AOM_CODEC_MEM_ERROR;
+ }
+
+ if ((f_ref = fopen(file_name, "wb")) == NULL) {
+ printf("Unable to open file %s to write.\n", file_name);
+ return AOM_CODEC_MEM_ERROR;
+ }
+
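+  // The planes are written as raw planar 4:2:0 (I420): a full-resolution Y
+  // plane followed by half-resolution U and V planes.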
+ // --- Y ---
+ for (h = 0; h < cm->height; ++h) {
+ fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref);
+ }
+ // --- U ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
+ f_ref);
+ }
+ // --- V ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
+ f_ref);
+ }
+
+ fclose(f_ref);
+
+ return AOM_CODEC_OK;
+}
+
+static void dump_ref_frame_images(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MV_REFERENCE_FRAME ref_frame;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ char file_name[256] = "";
+ snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv",
+ cm->current_video_frame, ref_frame);
+ dump_one_image(cm, get_ref_frame_buffer(cpi, ref_frame), file_name);
+ }
+}
+#endif // DUMP_REF_FRAME_IMAGES == 1
+
+#if CONFIG_EXT_REFS
+// This function is used to shift the virtual indices of last reference frames
+// as follows:
+// LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME
+// when the LAST_FRAME is updated.
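+// For example, if lst_fb_idxes is {3, 1, 2} before the shift, it becomes
+// {3, 3, 1}, and the caller then overwrites lst_fb_idxes[0] with the buffer
+// index that becomes the new LAST_FRAME.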
+static INLINE void shift_last_ref_frames(AV1_COMP *cpi) {
+ int ref_frame;
+ for (ref_frame = LAST_REF_FRAMES - 1; ref_frame > 0; --ref_frame) {
+ cpi->lst_fb_idxes[ref_frame] = cpi->lst_fb_idxes[ref_frame - 1];
+
+ // [0] is allocated to the current coded frame. The statistics for the
+ // reference frames start at [LAST_FRAME], i.e. [1].
+ if (!cpi->rc.is_src_frame_alt_ref) {
+ memcpy(cpi->interp_filter_selected[ref_frame + LAST_FRAME],
+ cpi->interp_filter_selected[ref_frame - 1 + LAST_FRAME],
+ sizeof(cpi->interp_filter_selected[ref_frame - 1 + LAST_FRAME]));
+ }
+ }
+}
+#endif // CONFIG_EXT_REFS
+
void av1_update_reference_frames(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
BufferPool *const pool = cm->buffer_pool;
const int use_upsampled_ref = cpi->sf.use_upsampled_references;
int new_uidx = 0;
+  // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
+  // to verify that there is no mismatch between encoder and decoder.
+ if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx;
+
if (use_upsampled_ref) {
#if CONFIG_EXT_REFS
if (cm->show_existing_frame) {
new_uidx = cpi->upsampled_ref_idx[cpi->existing_fb_idx_to_show];
+      // TODO(zoeliu): Remove this assert once it is confirmed to hold.
assert(cpi->upsampled_ref_bufs[new_uidx].ref_count > 0);
} else {
#endif // CONFIG_EXT_REFS
@@ -2417,11 +3056,10 @@
new_uidx = upsample_ref_frame(cpi, ref);
#if CONFIG_EXT_REFS
+ assert(new_uidx != INVALID_IDX);
}
#endif // CONFIG_EXT_REFS
}
- assert(new_uidx != -1);
-
// At this point the new frame has been encoded.
// If any buffer copy / swapping is signaled it should be done here.
if (cm->frame_type == KEY_FRAME) {
@@ -2433,6 +3071,7 @@
#endif // CONFIG_EXT_REFS
ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->alt_fb_idx],
cm->new_fb_idx);
+
if (use_upsampled_ref) {
uref_cnt_fb(cpi->upsampled_ref_bufs,
&cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
@@ -2463,20 +3102,68 @@
tmp = cpi->alt_fb_idx;
cpi->alt_fb_idx = cpi->gld_fb_idx;
cpi->gld_fb_idx = tmp;
+
+#if CONFIG_EXT_REFS
+ // We need to modify the mapping accordingly
+ cpi->arf_map[0] = cpi->alt_fb_idx;
+#endif
+// TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to
+// cpi->interp_filter_selected[GOLDEN_FRAME]?
+#if CONFIG_EXT_REFS
+ } else if (cpi->rc.is_last_bipred_frame) {
+    // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the LAST3_FRAME
+    // by updating the virtual indices. Note that the frame BWDREF_FRAME now
+    // points to should be retired and not used again until it is refreshed.
+ int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
+
+ shift_last_ref_frames(cpi);
+ cpi->lst_fb_idxes[0] = cpi->bwd_fb_idx;
+ cpi->bwd_fb_idx = tmp;
+
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[BWDREF_FRAME],
+ sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
+ } else if (cpi->rc.is_src_frame_ext_arf && cm->show_existing_frame) {
+ // Deal with the special case for showing existing internal ALTREF_FRAME
+ // Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME
+ // by updating the virtual indices.
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ int which_arf = gf_group->arf_ref_idx[gf_group->index];
+ int tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
+
+ shift_last_ref_frames(cpi);
+ cpi->lst_fb_idxes[0] = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = tmp;
+
+ // We need to modify the mapping accordingly
+ cpi->arf_map[which_arf] = cpi->alt_fb_idx;
+
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
+ sizeof(cpi->interp_filter_selected[ALTREF_FRAME + which_arf]));
+#endif // CONFIG_EXT_REFS
} else { /* For non key/golden frames */
if (cpi->refresh_alt_ref_frame) {
int arf_idx = cpi->alt_fb_idx;
+ int which_arf = 0;
+#if CONFIG_EXT_REFS
+ if (cpi->oxcf.pass == 2) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ which_arf = gf_group->arf_update_idx[gf_group->index];
+ arf_idx = cpi->arf_map[which_arf];
+ }
+#else
if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
arf_idx = gf_group->arf_update_idx[gf_group->index];
}
-
+#endif // CONFIG_EXT_REFS
ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
if (use_upsampled_ref)
uref_cnt_fb(cpi->upsampled_ref_bufs, &cpi->upsampled_ref_idx[arf_idx],
new_uidx);
- memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
+ memcpy(cpi->interp_filter_selected[ALTREF_FRAME + which_arf],
cpi->interp_filter_selected[0],
sizeof(cpi->interp_filter_selected[0]));
}
@@ -2488,18 +3175,27 @@
uref_cnt_fb(cpi->upsampled_ref_bufs,
&cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
+#if !CONFIG_EXT_REFS
if (!cpi->rc.is_src_frame_alt_ref)
+#endif // !CONFIG_EXT_REFS
memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
cpi->interp_filter_selected[0],
sizeof(cpi->interp_filter_selected[0]));
- else
- memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
- cpi->interp_filter_selected[ALTREF_FRAME],
- sizeof(cpi->interp_filter_selected[ALTREF_FRAME]));
}
#if CONFIG_EXT_REFS
if (cpi->refresh_bwd_ref_frame) {
+ if (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs) {
+ // We have swapped the virtual indices to allow bwd_ref_frame to use
+ // ALT0 as reference frame. We need to swap them back.
+      // NOTE: The ALT_REFs are indexed in reverse order, and ALT0 refers to
+      // the ALT_REF farthest from the first frame in the gf group.
+ int tmp = cpi->arf_map[0];
+ cpi->arf_map[0] = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = cpi->bwd_fb_idx;
+ cpi->bwd_fb_idx = tmp;
+ }
+
ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->bwd_fb_idx],
cm->new_fb_idx);
if (use_upsampled_ref)
@@ -2547,6 +3243,15 @@
// lst_fb_idxes[2], lst_fb_idxes[0], lst_fb_idxes[1]
int ref_frame;
+ if (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs) {
+ // We have swapped the virtual indices to use ALT0 as BWD_REF
+ // and we need to swap them back.
+ int tmp = cpi->arf_map[0];
+ cpi->arf_map[0] = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = cpi->bwd_fb_idx;
+ cpi->bwd_fb_idx = tmp;
+ }
+
if (cm->frame_type == KEY_FRAME) {
for (ref_frame = 0; ref_frame < LAST_REF_FRAMES; ++ref_frame) {
ref_cnt_fb(pool->frame_bufs,
@@ -2572,28 +3277,16 @@
new_uidx);
tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES - 1];
- for (ref_frame = LAST_REF_FRAMES - 1; ref_frame > 0; --ref_frame) {
- cpi->lst_fb_idxes[ref_frame] = cpi->lst_fb_idxes[ref_frame - 1];
- if (!cpi->rc.is_src_frame_alt_ref) {
- memcpy(cpi->interp_filter_selected[ref_frame],
- cpi->interp_filter_selected[ref_frame - 1],
- sizeof(cpi->interp_filter_selected[ref_frame - 1]));
- }
- }
+ shift_last_ref_frames(cpi);
cpi->lst_fb_idxes[0] = tmp;
- if (!cpi->rc.is_src_frame_alt_ref) {
- if (cm->show_existing_frame) {
- memcpy(cpi->interp_filter_selected[LAST_FRAME],
- cpi->interp_filter_selected[BWDREF_FRAME],
- sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
- } else {
- memcpy(cpi->interp_filter_selected[LAST_FRAME],
- cpi->interp_filter_selected[0],
- sizeof(cpi->interp_filter_selected[0]));
- }
- }
+ assert(cm->show_existing_frame == 0);
+      // NOTE: Currently only LF_UPDATE and INTNL_OVERLAY_UPDATE frames
+      // refresh the LAST_FRAME.
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
}
#else
ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
@@ -2601,13 +3294,18 @@
if (use_upsampled_ref)
uref_cnt_fb(cpi->upsampled_ref_bufs,
&cpi->upsampled_ref_idx[cpi->lst_fb_idx], new_uidx);
-
- if (!cpi->rc.is_src_frame_alt_ref)
+ if (!cpi->rc.is_src_frame_alt_ref) {
memcpy(cpi->interp_filter_selected[LAST_FRAME],
cpi->interp_filter_selected[0],
sizeof(cpi->interp_filter_selected[0]));
+ }
#endif // CONFIG_EXT_REFS
}
+
+#if DUMP_REF_FRAME_IMAGES == 1
+ // Dump out all reference frame images.
+ dump_ref_frame_images(cpi);
+#endif // DUMP_REF_FRAME_IMAGES
}
static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
@@ -2622,21 +3320,28 @@
aom_usec_timer_start(&timer);
+#if CONFIG_LOOP_RESTORATION
+ av1_pick_filter_restoration(cpi->Source, cpi, cpi->sf.lpf_pick);
+#else
av1_pick_filter_level(cpi->Source, cpi, cpi->sf.lpf_pick);
+#endif // CONFIG_LOOP_RESTORATION
aom_usec_timer_mark(&timer);
cpi->time_pick_lpf += aom_usec_timer_elapsed(&timer);
}
if (lf->filter_level > 0) {
+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION
+ av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+#else
if (cpi->num_workers > 1)
av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
lf->filter_level, 0, 0, cpi->workers,
cpi->num_workers, &cpi->lf_row_sync);
else
av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+#endif
}
-
#if CONFIG_DERING
if (is_lossless_requested(&cpi->oxcf)) {
cm->dering_level = 0;
@@ -2695,17 +3400,26 @@
}
}
#endif
+#if CONFIG_LOOP_RESTORATION
+ if (cm->rst_info.restoration_type != RESTORE_NONE) {
+ av1_loop_restoration_init(&cm->rst_internal, &cm->rst_info,
+ cm->frame_type == KEY_FRAME, cm->width,
+ cm->height);
+ av1_loop_restoration_rows(cm->frame_to_show, cm, 0, cm->mi_rows, 0);
+ }
+#endif // CONFIG_LOOP_RESTORATION
aom_extend_frame_inner_borders(cm->frame_to_show);
}
-static INLINE void alloc_frame_mvs(const AV1_COMMON *cm, int buffer_idx) {
+static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, int buffer_idx) {
RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows ||
new_fb_ptr->mi_cols < cm->mi_cols) {
aom_free(new_fb_ptr->mvs);
- new_fb_ptr->mvs = (MV_REF *)aom_calloc(cm->mi_rows * cm->mi_cols,
- sizeof(*new_fb_ptr->mvs));
+ CHECK_MEM_ERROR(cm, new_fb_ptr->mvs,
+ (MV_REF *)aom_calloc(cm->mi_rows * cm->mi_cols,
+ sizeof(*new_fb_ptr->mvs)));
new_fb_ptr->mi_rows = cm->mi_rows;
new_fb_ptr->mi_cols = cm->mi_cols;
}
@@ -2714,7 +3428,7 @@
void av1_scale_references(AV1_COMP *cpi) {
AV1_COMMON *cm = &cpi->common;
MV_REFERENCE_FRAME ref_frame;
- const AOM_REFFRAME ref_mask[REFS_PER_FRAME] = {
+ const AOM_REFFRAME ref_mask[INTER_REFS_PER_FRAME] = {
AOM_LAST_FLAG,
#if CONFIG_EXT_REFS
AOM_LAST2_FLAG,
@@ -2752,10 +3466,12 @@
new_fb_ptr = &pool->frame_bufs[new_fb];
if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
new_fb_ptr->buf.y_crop_height != cm->height) {
- aom_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
- cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
- cm->byte_alignment, NULL, NULL, NULL);
+ if (aom_realloc_frame_buffer(
+ &new_fb_ptr->buf, cm->width, cm->height, cm->subsampling_x,
+ cm->subsampling_y, cm->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE,
(int)cm->bit_depth);
cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
@@ -2774,10 +3490,12 @@
new_fb_ptr = &pool->frame_bufs[new_fb];
if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
new_fb_ptr->buf.y_crop_height != cm->height) {
- aom_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
- AOM_BORDER_IN_PIXELS, cm->byte_alignment,
- NULL, NULL, NULL);
+ if (aom_realloc_frame_buffer(&new_fb_ptr->buf, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE);
cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
alloc_frame_mvs(cm, new_fb);
@@ -2828,7 +3546,7 @@
if (cpi->oxcf.pass == 0) {
// Only release scaled references under certain conditions:
// if reference will be updated, or if scaled reference has same resolution.
- int refresh[3];
+ int refresh[INTER_REFS_PER_FRAME];
refresh[0] = (cpi->refresh_last_frame) ? 1 : 0;
#if CONFIG_EXT_REFS
refresh[1] = refresh[2] = 0;
@@ -2852,7 +3570,7 @@
}
}
} else {
- for (i = 0; i < MAX_REF_FRAMES; ++i) {
+ for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) {
const int idx = cpi->scaled_ref_idx[i];
RefCntBuffer *const buf =
idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL;
@@ -2875,8 +3593,8 @@
model_count[EOB_MODEL_TOKEN] = full_count[EOB_TOKEN];
}
-static void full_to_model_counts(av1_coeff_count_model *model_count,
- av1_coeff_count *full_count) {
+void av1_full_to_model_counts(av1_coeff_count_model *model_count,
+ av1_coeff_count *full_count) {
int i, j, k, l;
for (i = 0; i < PLANE_TYPES; ++i)
@@ -2897,7 +3615,7 @@
recon_err = aom_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
if (cpi->twopass.total_left_stats.coded_error != 0.0)
- fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d"
+ fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
"%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" "
"%10"PRId64" %10"PRId64" %10d "
"%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
@@ -2906,8 +3624,6 @@
"%10lf %8u %10"PRId64" %10d %10d %10d\n",
cpi->common.current_video_frame,
cm->width, cm->height,
- cpi->td.rd_counts.m_search_count,
- cpi->td.rd_counts.ex_search_count,
cpi->rc.source_alt_ref_pending,
cpi->rc.source_alt_ref_active,
cpi->rc.this_frame_target,
@@ -3068,19 +3784,21 @@
alloc_frame_mvs(cm, cm->new_fb_idx);
// Reset the frame pointers to the current frame size.
- aom_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
+ if (aom_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
#if CONFIG_AOM_HIGHBITDEPTH
- cm->use_highbitdepth,
+ cm->use_highbitdepth,
#endif
- AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL,
- NULL);
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+ NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
alloc_util_frame_buffers(cpi);
init_motion_estimation(cpi);
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
+ RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - LAST_FRAME];
const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
ref_buf->idx = buf_idx;
@@ -3168,6 +3886,14 @@
setup_frame(cpi);
+#if CONFIG_ENTROPY
+ cm->do_subframe_update = cm->tile_cols == 1 && cm->tile_rows == 1;
+ av1_copy(cm->starting_coef_probs, cm->fc->coef_probs);
+ av1_copy(cpi->subframe_stats.enc_starting_coef_probs, cm->fc->coef_probs);
+ cm->coef_probs_update_idx = 0;
+ av1_copy(cpi->subframe_stats.coef_probs_buf[0], cm->fc->coef_probs);
+#endif // CONFIG_ENTROPY
+
suppress_active_map(cpi);
// Variance adaptive and in frame q adjustment experiments are mutually
// exclusive.
@@ -3269,6 +3995,41 @@
if (loop_count == 0) setup_frame(cpi);
+#if CONFIG_ENTROPY
+ // Base q-index may have changed, so we need to assign proper default coef
+ // probs before every iteration.
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+ int i;
+ av1_default_coef_probs(cm);
+ if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode ||
+ cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) {
+ for (i = 0; i < FRAME_CONTEXTS; ++i) cm->frame_contexts[i] = *cm->fc;
+ } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) {
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+ }
+ }
+#endif // CONFIG_ENTROPY
+
+#if CONFIG_ENTROPY
+ cm->do_subframe_update = cm->tile_cols == 1 && cm->tile_rows == 1;
+ if (loop_count == 0 || frame_is_intra_only(cm) ||
+ cm->error_resilient_mode) {
+ av1_copy(cm->starting_coef_probs, cm->fc->coef_probs);
+ av1_copy(cpi->subframe_stats.enc_starting_coef_probs, cm->fc->coef_probs);
+ } else {
+ if (cm->do_subframe_update) {
+ av1_copy(cm->fc->coef_probs,
+ cpi->subframe_stats.enc_starting_coef_probs);
+ av1_copy(cm->starting_coef_probs,
+ cpi->subframe_stats.enc_starting_coef_probs);
+ av1_zero(cpi->subframe_stats.coef_counts_buf);
+ av1_zero(cpi->subframe_stats.eob_counts_buf);
+ }
+ }
+ cm->coef_probs_update_idx = 0;
+ av1_copy(cpi->subframe_stats.coef_probs_buf[0], cm->fc->coef_probs);
+#endif // CONFIG_ENTROPY
+
// Variance adaptive and in frame q adjustment experiments are mutually
// exclusive.
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
@@ -3291,6 +4052,7 @@
// to recode.
if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
save_coding_context(cpi);
+
av1_pack_bitstream(cpi, dest, size);
rc->projected_frame_size = (int)(*size) << 3;
@@ -3496,7 +4258,8 @@
#if CONFIG_EXT_REFS
// Disable the use of BWDREF_FRAME for non-bipredictive frames.
- if (!(cpi->rc.is_bipred_frame || cpi->rc.is_last_bipred_frame))
+ if (!(cpi->rc.is_bipred_frame || cpi->rc.is_last_bipred_frame ||
+ (cpi->rc.is_bwd_ref_frame && cpi->num_extra_arfs)))
flags &= ~AOM_BWD_FLAG;
#endif // CONFIG_EXT_REFS
@@ -3572,7 +4335,13 @@
static void set_arf_sign_bias(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
int arf_sign_bias;
-
+#if CONFIG_EXT_REFS
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ // The arf_sign_bias will be set to 1 for internal ARFs.
+ arf_sign_bias = cpi->rc.source_alt_ref_active &&
+ (!cpi->refresh_alt_ref_frame ||
+ (gf_group->rf_level[gf_group->index] == GF_ARF_LOW));
+#else
if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
arf_sign_bias = cpi->rc.source_alt_ref_active &&
@@ -3582,6 +4351,8 @@
arf_sign_bias =
(cpi->rc.source_alt_ref_active && !cpi->refresh_alt_ref_frame);
}
+#endif // CONFIG_EXT_REFS
+
cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias;
#if CONFIG_EXT_REFS
cm->ref_frame_sign_bias[BWDREF_FRAME] = cm->ref_frame_sign_bias[ALTREF_FRAME];
@@ -3590,16 +4361,34 @@
static int setup_interp_filter_search_mask(AV1_COMP *cpi) {
InterpFilter ifilter;
- int ref_total[MAX_REF_FRAMES] = { 0 };
+ int ref_total[TOTAL_REFS_PER_FRAME] = { 0 };
MV_REFERENCE_FRAME ref;
int mask = 0;
+ int arf_idx = ALTREF_FRAME;
+
+#if CONFIG_EXT_REFS
+ // Get which arf is used as ALTREF_FRAME.
+ if (cpi->oxcf.pass == 2)
+ arf_idx += cpi->twopass.gf_group.arf_ref_idx[cpi->twopass.gf_group.index];
+#endif // CONFIG_EXT_REFS
+
if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame)
return mask;
- for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref)
- for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter)
+
+#if CONFIG_EXT_REFS
+ for (ref = LAST_FRAME; ref < ALTREF_FRAME; ++ref)
+ for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter)
ref_total[ref] += cpi->interp_filter_selected[ref][ifilter];
- for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter) {
+ for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter)
+ ref_total[ref] += cpi->interp_filter_selected[arf_idx][ifilter];
+#else
+ for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref)
+ for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter)
+ ref_total[ref] += cpi->interp_filter_selected[ref][ifilter];
+#endif // CONFIG_EXT_REFS
+
+ for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) {
if ((ref_total[LAST_FRAME] &&
cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) &&
#if CONFIG_EXT_REFS
@@ -3619,13 +4408,69 @@
ref_total[BWDREF_FRAME]) &&
#endif // CONFIG_EXT_REFS
(ref_total[ALTREF_FRAME] == 0 ||
- cpi->interp_filter_selected[ALTREF_FRAME][ifilter] * 50 <
+ cpi->interp_filter_selected[arf_idx][ifilter] * 50 <
ref_total[ALTREF_FRAME]))
mask |= 1 << ifilter;
}
return mask;
}
+#define DUMP_RECON_FRAMES 0
+
+#if DUMP_RECON_FRAMES == 1
+// NOTE(zoeliu): For debug - Output the filtered reconstructed video.
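+// Each frame is appended to /tmp/enc_filtered_recon.yuv as raw 4:2:0 data
+// (full-resolution Y plane followed by quarter-resolution U and V planes),
+// so, assuming an 8-bit source, one frame occupies width * height * 3 / 2
+// bytes of the output file.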
+static void dump_filtered_recon_frames(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const YV12_BUFFER_CONFIG *recon_buf = cm->frame_to_show;
+ int h;
+ char file_name[256] = "/tmp/enc_filtered_recon.yuv";
+ FILE *f_recon = NULL;
+
+ if (recon_buf == NULL || !cm->show_frame) {
+ printf("Frame %d is not ready or no show to dump.\n",
+ cm->current_video_frame);
+ return;
+ }
+
+ if (cm->current_video_frame == 0) {
+ if ((f_recon = fopen(file_name, "wb")) == NULL) {
+ printf("Unable to open file %s to write.\n", file_name);
+ return;
+ }
+ } else {
+ if ((f_recon = fopen(file_name, "ab")) == NULL) {
+ printf("Unable to open file %s to append.\n", file_name);
+ return;
+ }
+ }
+ printf(
+ "\nFrame=%5d, encode_update_type[%5d]=%1d, show_existing_frame=%d, "
+ "y_stride=%4d, uv_stride=%4d, width=%4d, height=%4d\n",
+ cm->current_video_frame, cpi->twopass.gf_group.index,
+ cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
+ cm->show_existing_frame, recon_buf->y_stride, recon_buf->uv_stride,
+ cm->width, cm->height);
+
+ // --- Y ---
+ for (h = 0; h < cm->height; ++h) {
+ fwrite(&recon_buf->y_buffer[h * recon_buf->y_stride], 1, cm->width,
+ f_recon);
+ }
+ // --- U ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&recon_buf->u_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+ f_recon);
+ }
+ // --- V ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&recon_buf->v_buffer[h * recon_buf->uv_stride], 1, (cm->width >> 1),
+ f_recon);
+ }
+
+ fclose(f_recon);
+}
+#endif // DUMP_RECON_FRAMES
+
static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
uint8_t *dest,
unsigned int *frame_flags) {
@@ -3633,7 +4478,6 @@
const AV1EncoderConfig *const oxcf = &cpi->oxcf;
struct segmentation *const seg = &cm->seg;
TX_SIZE t;
-
set_ext_overrides(cpi);
aom_clear_system_state();
@@ -3650,14 +4494,17 @@
cpi->ref_frame_flags = get_ref_frame_flags(cpi);
if (cm->show_existing_frame) {
- // NOTE: The existing frame to show is the current BWDREF_FRAME in the
- // reference frame buffer.
-
+ // NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current
+ // BWDREF_FRAME in the reference frame buffer.
cm->frame_type = INTER_FRAME;
cm->show_frame = 1;
cpi->frame_flags = *frame_flags;
- cpi->refresh_last_frame = 1;
+ // In the case of show_existing_frame, we will not send the refresh flags
+ // to the decoder. Any change in the reference frame buffer can be done by
+ // switching the virtual indices.
+
+ cpi->refresh_last_frame = 0;
cpi->refresh_golden_frame = 0;
cpi->refresh_bwd_ref_frame = 0;
cpi->refresh_alt_ref_frame = 0;
@@ -3672,18 +4519,10 @@
// Set up frame to show to get ready for stats collection.
cm->frame_to_show = get_frame_new_buffer(cm);
-#ifdef OUTPUT_YUV_REC
- // NOTE: For debug - Output the filtered reconstructed video.
- assert(cm->frame_to_show != NULL);
- printf(
- "\nFrame=%5d, encode_update_type[%5d]=%1d, show_existing_frame=%d, "
- "y_stride=%4d, uv_stride=%4d, width=%4d, height=%4d\n",
- cm->current_video_frame, cpi->twopass.gf_group.index,
- cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
- cm->show_existing_frame, cm->frame_to_show->y_stride,
- cm->frame_to_show->uv_stride, cm->width, cm->height);
- av1_write_yuv_rec_frame(cm);
-#endif // OUTPUT_YUV_REC
+#if DUMP_RECON_FRAMES == 1
+ // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+ dump_filtered_recon_frames(cpi);
+#endif // DUMP_RECON_FRAMES
// Update the LAST_FRAME in the reference frame buffer.
av1_update_reference_frames(cpi);
@@ -3698,6 +4537,13 @@
// Update the frame type
cm->last_frame_type = cm->frame_type;
+ // Since we allocate a spot for the OVERLAY frame in the gf group, we need
+ // to do post-encoding update accordingly.
+ if (cpi->rc.is_src_frame_alt_ref) {
+ av1_set_target_rate(cpi);
+ av1_rc_postencode_update(cpi, *size);
+ }
+
cm->last_width = cm->width;
cm->last_height = cm->height;
@@ -3732,7 +4578,7 @@
// By default, encoder assumes decoder can use prev_mi.
if (cm->error_resilient_mode) {
cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
- cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_OFF;
+ cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_FORWARD;
} else if (cm->intra_only) {
// Only reset the current context.
cm->reset_frame_context = RESET_FRAME_CONTEXT_CURRENT;
@@ -3773,7 +4619,7 @@
if (cpi->common.current_video_frame > 1) {
av1_compute_skin_map(cpi, yuv_skinmap_file);
}
-#endif
+#endif // OUTPUT_YUV_SKINMAP
// Special case code to reduce pulsing when key frames are forced at a
// fixed interval. Note the reconstruction error if it is the frame before
@@ -3792,7 +4638,9 @@
}
// If the encoder forced a KEY_FRAME decision
- if (cm->frame_type == KEY_FRAME) cpi->refresh_last_frame = 1;
+ if (cm->frame_type == KEY_FRAME) {
+ cpi->refresh_last_frame = 1;
+ }
cm->frame_to_show = get_frame_new_buffer(cm);
cm->frame_to_show->color_space = cm->color_space;
@@ -3808,30 +4656,13 @@
// Pick the loop filter level for the frame.
loopfilter_frame(cpi, cm);
- // build the bitstream
+ // Build the bitstream
av1_pack_bitstream(cpi, dest, size);
-#ifdef OUTPUT_YUV_REC
- if (cm->show_frame) {
- // NOTE: For debug - Output the filtered reconstructed video.
- printf(
- "\nFrame=%5d, encode_update_type[%5d]=%1d, show_existing_frame=%d, "
- "y_stride=%4d, uv_stride=%4d, width=%4d, height=%4d\n",
- cm->current_video_frame, cpi->twopass.gf_group.index,
- cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
- cm->show_existing_frame, cm->frame_to_show->y_stride,
- cm->frame_to_show->uv_stride, cm->width, cm->height);
- av1_write_yuv_rec_frame(cm);
- }
-#endif // OUTPUT_YUV_REC
-
-#if CONFIG_EXT_REFS
- if (cpi->rc.is_last_bipred_frame) {
- // NOTE: If the current frame is a LAST_BIPRED_FRAME, next it is needed
- // to show the BWDREF_FRAME.
- cpi->existing_fb_idx_to_show = cpi->bwd_fb_idx;
- }
-#endif // CONFIG_EXT_REFS
+#if DUMP_RECON_FRAMES == 1
+ // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+ if (cm->show_frame) dump_filtered_recon_frames(cpi);
+#endif // DUMP_RECON_FRAMES
#if CONFIG_CLPF
aom_free(cm->clpf_blocks);
@@ -3843,13 +4674,17 @@
if (frame_is_intra_only(cm) == 0) {
release_scaled_references(cpi);
}
+
av1_update_reference_frames(cpi);
for (t = TX_4X4; t <= TX_32X32; t++)
- full_to_model_counts(cpi->td.counts->coef[t],
- cpi->td.rd_counts.coef_counts[t]);
+ av1_full_to_model_counts(cpi->td.counts->coef[t],
+ cpi->td.rd_counts.coef_counts[t]);
if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+#if CONFIG_ENTROPY
+ cm->partial_prob_update = 0;
+#endif // CONFIG_ENTROPY
av1_adapt_coef_probs(cm);
av1_adapt_intra_frame_probs(cm);
}
@@ -3911,6 +4746,10 @@
if (!cm->show_existing_frame) cm->last_show_frame = cm->show_frame;
if (cm->show_frame) {
+#if CONFIG_EXT_REFS
+// TODO(zoeliu): We may only swap mi and prev_mi for those frames that are
+// being used as reference.
+#endif // CONFIG_EXT_REFS
av1_swap_mi_and_prev_mi(cm);
// Don't increment frame counters if this was an altref buffer
// update not a real frame
@@ -3939,10 +4778,16 @@
encode_frame_to_data_rate(cpi, size, dest, frame_flags);
#if CONFIG_EXT_REFS
- // Donot do the post-encoding update for show_existing_frame==1.
- if (!cpi->common.show_existing_frame)
-#endif // CONFIG_EXT_REFS
+ // Do not do post-encoding update for those frames that do not have a spot in
+ // a gf group, but note that an OVERLAY frame always has a spot in a gf group,
+ // even when show_existing_frame is used.
+ if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref) {
av1_twopass_postencode_update(cpi);
+ }
+ check_show_existing_frame(cpi);
+#else
+ av1_twopass_postencode_update(cpi);
+#endif // CONFIG_EXT_REFS
}
static void init_ref_frame_bufs(AV1_COMMON *cm) {
@@ -3989,13 +4834,16 @@
int av1_receive_raw_frame(AV1_COMP *cpi, unsigned int frame_flags,
YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
int64_t end_time) {
- AV1_COMMON *cm = &cpi->common;
+ AV1_COMMON *const cm = &cpi->common;
struct aom_usec_timer timer;
int res = 0;
const int subsampling_x = sd->subsampling_x;
const int subsampling_y = sd->subsampling_y;
#if CONFIG_AOM_HIGHBITDEPTH
const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+#endif
+
+#if CONFIG_AOM_HIGHBITDEPTH
check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
#else
check_initial_width(cpi, subsampling_x, subsampling_y);
@@ -4036,8 +4884,7 @@
#if CONFIG_EXT_REFS
cpi->refresh_bwd_ref_frame ||
#endif // CONFIG_EXT_REFS
- cpi->refresh_alt_ref_frame ||
- cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF ||
+ cpi->refresh_alt_ref_frame || !cm->error_resilient_mode ||
cm->lf.mode_ref_delta_update || cm->seg.update_map ||
cm->seg.update_data;
}
@@ -4106,7 +4953,7 @@
// TODO(zoeliu): We need to add the check on the -bwd_ref command line setup
// flag.
- if (gf_group->brf_pred_enabled[gf_group->index]) {
+ if (gf_group->bidir_pred_enabled[gf_group->index]) {
if (cpi->oxcf.pass == 2) {
if (gf_group->update_type[gf_group->index] == BRF_UPDATE)
brf_src_index = gf_group->brf_src_offset[gf_group->index];
@@ -4124,9 +4971,15 @@
const struct lookahead_entry *source) {
RATE_CONTROL *const rc = &cpi->rc;
+ // If pass == 2, the parameters set here will be reset in
+ // av1_rc_get_second_pass_params()
+
if (cpi->oxcf.pass == 2) {
const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
rc->is_src_frame_alt_ref =
+#if CONFIG_EXT_REFS
+ (gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE) ||
+#endif // CONFIG_EXT_REFS
(gf_group->update_type[gf_group->index] == OVERLAY_UPDATE);
} else {
rc->is_src_frame_alt_ref =
@@ -4294,11 +5147,10 @@
// Normal defaults
cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
- cm->refresh_frame_context = oxcf->error_resilient_mode
- ? REFRESH_FRAME_CONTEXT_OFF
- : oxcf->frame_parallel_decoding_mode
- ? REFRESH_FRAME_CONTEXT_FORWARD
- : REFRESH_FRAME_CONTEXT_BACKWARD;
+ cm->refresh_frame_context =
+ (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode)
+ ? REFRESH_FRAME_CONTEXT_FORWARD
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
cpi->refresh_last_frame = 1;
cpi->refresh_golden_frame = 0;
@@ -4316,16 +5168,21 @@
return -1;
}
cpi->Source = &source->img;
-
// TODO(zoeliu): To track down to determine whether it's needed to adjust
// the frame rate.
*time_stamp = source->ts_start;
*time_end = source->ts_end;
+ // We need to adjust frame rate for an overlay frame
+ if (cpi->rc.is_src_frame_alt_ref) {
+ adjust_frame_rate(cpi, source);
+ }
+
// Find a free buffer for the new frame, releasing the reference previously
// held.
- if (cm->new_fb_idx != INVALID_IDX)
+ if (cm->new_fb_idx != INVALID_IDX) {
--pool->frame_bufs[cm->new_fb_idx].ref_count;
+ }
cm->new_fb_idx = get_free_fb(cm);
if (cm->new_fb_idx == INVALID_IDX) return -1;
@@ -4336,6 +5193,11 @@
// Start with a 0 size frame.
*size = 0;
+ // We need to update the gf_group for show_existing overlay frame
+ if (cpi->rc.is_src_frame_alt_ref) {
+ av1_rc_get_second_pass_params(cpi);
+ }
+
Pass2Encode(cpi, size, dest, frame_flags);
if (cpi->b_calculate_psnr) generate_psnr_packet(cpi);
@@ -4349,13 +5211,25 @@
aom_clear_system_state();
cm->show_existing_frame = 0;
-
return 0;
}
#endif // CONFIG_EXT_REFS
// Should we encode an arf frame.
arf_src_index = get_arf_src_index(cpi);
+ if (arf_src_index) {
+ for (i = 0; i <= arf_src_index; ++i) {
+ struct lookahead_entry *e = av1_lookahead_peek(cpi->lookahead, i);
+ // Avoid creating an alt-ref if there's a forced keyframe pending.
+ if (e == NULL) {
+ break;
+ } else if (e->flags == AOM_EFLAG_FORCE_KF) {
+ arf_src_index = 0;
+ flush = 1;
+ break;
+ }
+ }
+ }
if (arf_src_index) {
assert(arf_src_index <= rc->frames_to_key);
@@ -4376,10 +5250,8 @@
cpi->refresh_golden_frame = 0;
cpi->refresh_last_frame = 0;
rc->is_src_frame_alt_ref = 0;
- rc->source_alt_ref_pending = 0;
- } else {
- rc->source_alt_ref_pending = 0;
}
+ rc->source_alt_ref_pending = 0;
}
#if CONFIG_EXT_REFS
@@ -4448,9 +5320,7 @@
aom_clear_system_state();
// adjust frame rates based on timestamps given
- if (cm->show_frame) {
- adjust_frame_rate(cpi, source);
- }
+ if (cm->show_frame) adjust_frame_rate(cpi, source);
// Find a free buffer for the new frame, releasing the reference previously
// held.
@@ -4463,6 +5333,12 @@
cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
+#if CONFIG_EXT_REFS
+ if (oxcf->pass == 2) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ cpi->alt_fb_idx = cpi->arf_map[gf_group->arf_ref_idx[gf_group->index]];
+ }
+#else
if (cpi->multi_arf_allowed) {
if (cm->frame_type == KEY_FRAME) {
init_buffer_indices(cpi);
@@ -4471,6 +5347,7 @@
cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index];
}
}
+#endif // CONFIG_EXT_REFS
// Start with a 0 size frame.
*size = 0;
@@ -4484,7 +5361,8 @@
}
if (cpi->oxcf.pass != 0 || frame_is_intra_only(cm) == 1) {
- for (i = 0; i < MAX_REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX;
+ for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i)
+ cpi->scaled_ref_idx[i] = INVALID_IDX;
}
#if CONFIG_AOM_QM
@@ -4503,7 +5381,7 @@
Pass0Encode(cpi, size, dest, frame_flags);
}
- if (cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF)
+ if (!cm->error_resilient_mode)
cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
// No frame encoded, or frame was dropped, release scaled references.
@@ -4526,27 +5404,15 @@
compute_internal_stats(cpi);
cpi->bytes += (int)(*size);
}
-#endif
+#endif // CONFIG_INTERNAL_STATS
aom_clear_system_state();
-#if CONFIG_EXT_REFS
- if (cpi->rc.is_last_bipred_frame) {
- // NOTE(zoeliu): If the current frame is a last bi-predictive frame, it is
- // needed next to show the BWDREF_FRAME.
- cpi->rc.is_last_bipred_frame = 0;
- cm->show_existing_frame = 1;
- } else {
- cm->show_existing_frame = 0;
- }
-#endif // CONFIG_EXT_REFS
-
return 0;
}
int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
AV1_COMMON *cm = &cpi->common;
-
if (!cm->show_frame) {
return -1;
} else {
@@ -4566,6 +5432,14 @@
}
}
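+// Hands back, via *frame, the buffer holding the most recently shown frame;
+// returns -1 when no such frame is available yet and 0 on success.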
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) {
+ if (cpi->last_show_frame_buf_idx == INVALID_IDX) return -1;
+
+ *frame =
+ cpi->common.buffer_pool->frame_bufs[cpi->last_show_frame_buf_idx].buf;
+ return 0;
+}
+
int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode,
AOM_SCALING vert_mode) {
AV1_COMMON *cm = &cpi->common;
@@ -4622,11 +5496,8 @@
int av1_get_quantizer(AV1_COMP *cpi) { return cpi->common.base_qindex; }
void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
- if (flags & (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_GF |
-#if CONFIG_EXT_REFS
- AOM_EFLAG_NO_REF_BRF |
-#endif // CONFIG_EXT_REFS
- AOM_EFLAG_NO_REF_ARF)) {
+ if (flags &
+ (AOM_EFLAG_NO_REF_LAST | AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF)) {
int ref = AOM_REFFRAME_ALL;
if (flags & AOM_EFLAG_NO_REF_LAST) {
@@ -4639,24 +5510,14 @@
if (flags & AOM_EFLAG_NO_REF_GF) ref ^= AOM_GOLD_FLAG;
-#if CONFIG_EXT_REFS
- if (flags & AOM_EFLAG_NO_REF_BRF) ref ^= AOM_BWD_FLAG;
-#endif // CONFIG_EXT_REFS
-
if (flags & AOM_EFLAG_NO_REF_ARF) ref ^= AOM_ALT_FLAG;
av1_use_as_reference(cpi, ref);
}
- if (flags & (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF |
-#if CONFIG_EXT_REFS
- AOM_EFLAG_NO_UPD_BRF |
-#endif // CONFIG_EXT_REFS
- AOM_EFLAG_NO_UPD_ARF | AOM_EFLAG_FORCE_GF |
-#if CONFIG_EXT_REFS
- AOM_EFLAG_FORCE_BRF |
-#endif // CONFIG_EXT_REFS
- AOM_EFLAG_FORCE_ARF)) {
+ if (flags &
+ (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF |
+ AOM_EFLAG_FORCE_GF | AOM_EFLAG_FORCE_ARF)) {
int upd = AOM_REFFRAME_ALL;
if (flags & AOM_EFLAG_NO_UPD_LAST) {
diff --git a/av1/encoder/encoder.h b/av1/encoder/encoder.h
index daa90b3..ae48474 100644
--- a/av1/encoder/encoder.h
+++ b/av1/encoder/encoder.h
@@ -21,8 +21,10 @@
#include "av1/common/entropymode.h"
#include "av1/common/thread_common.h"
#include "av1/common/onyxc_int.h"
-
#include "av1/encoder/aq_cyclicrefresh.h"
+#if CONFIG_ANS
+#include "aom_dsp/buf_ans.h"
+#endif
#include "av1/encoder/context_tree.h"
#include "av1/encoder/encodemb.h"
#include "av1/encoder/firstpass.h"
@@ -34,10 +36,8 @@
#include "av1/encoder/rd.h"
#include "av1/encoder/speed_features.h"
#include "av1/encoder/tokenize.h"
+#include "av1/encoder/variance_tree.h"
-#if CONFIG_ANS
-#include "aom_dsp/buf_ans.h"
-#endif // CONFIG_ANS
#if CONFIG_INTERNAL_STATS
#include "aom_dsp/ssim.h"
#endif
@@ -49,12 +49,6 @@
extern "C" {
#endif
-// av1_update_reference_frames relies on there being 1 more buffer available
-// than the number of used references so a new buffer can always be allocated.
-// Therefore when multi arf mode is enabled, 1 more buffer is required then
-// MAX_REF_FRAMES.
-#define MAX_UPSAMPLED_BUFS (MAX_REF_FRAMES + 1)
-
typedef struct {
int nmvjointcost[MV_JOINTS];
int nmvcosts[2][MV_VALS];
@@ -67,7 +61,7 @@
#endif
// 0 = Intra, Last, GF, ARF
- signed char last_ref_lf_deltas[MAX_REF_FRAMES];
+ signed char last_ref_lf_deltas[TOTAL_REFS_PER_FRAME];
// 0 = ZERO_MV, MV
signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS];
@@ -86,6 +80,8 @@
#if CONFIG_EXT_REFS
// backward reference frame
BRF_FRAME = 4,
+ // extra alternate reference frame
+ EXT_ARF_FRAME = 5
#endif
} FRAME_CONTEXT_INDEX;
@@ -147,7 +143,7 @@
int height; // height of data passed to the compressor
unsigned int input_bit_depth; // Input bit depth.
double init_framerate; // set to passed in framerate
- int64_t target_bandwidth; // bandwidth to be used in kilobits per second
+ int64_t target_bandwidth; // bandwidth to be used in bits per second
int noise_sensitivity; // pre processing blur: recommendation 0
int sharpness; // sharpening output: recommendation 0:
@@ -214,6 +210,9 @@
// ----------------------------------------------------------------
int enable_auto_arf;
+#if CONFIG_EXT_REFS
+ int enable_auto_brf; // (b)ackward (r)ef (f)rame
+#endif // CONFIG_EXT_REFS
/* Bitfield defining the error resiliency features to enable.
* Can provide decodable frames after losses in previous
@@ -254,6 +253,10 @@
int color_range;
int render_width;
int render_height;
+
+#if CONFIG_EXT_PARTITION
+ aom_superblock_size_t superblock_size;
+#endif // CONFIG_EXT_PARTITION
} AV1EncoderConfig;
static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) {
@@ -265,6 +268,8 @@
TileInfo tile_info;
int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
int mode_map[BLOCK_SIZES][MAX_MODES];
+ int m_search_count;
+ int ex_search_count;
#if CONFIG_PVQ
PVQ_QUEUE pvq_q;
#endif
@@ -273,8 +278,6 @@
typedef struct RD_COUNTS {
av1_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
int64_t comp_pred_diff[REFERENCE_MODES];
- int m_search_count;
- int ex_search_count;
} RD_COUNTS;
typedef struct ThreadData {
@@ -284,7 +287,10 @@
PICK_MODE_CONTEXT *leaf_tree;
PC_TREE *pc_tree;
- PC_TREE *pc_root;
+ PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+
+ VAR_TREE *var_tree;
+ VAR_TREE *var_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
} ThreadData;
struct EncWorkerData;
@@ -295,24 +301,47 @@
unsigned char *map;
} ActiveMap;
-typedef enum { Y, U, V, ALL } STAT_TYPE;
+#define NUM_STAT_TYPES 4 // types of stats: Y, U, V and ALL
typedef struct IMAGE_STAT {
- double stat[ALL + 1];
+ double stat[NUM_STAT_TYPES];
double worst;
} ImageStat;
+#undef NUM_STAT_TYPES
+
typedef struct {
int ref_count;
YV12_BUFFER_CONFIG buf;
} EncRefCntBuffer;
+#if CONFIG_ENTROPY
+typedef struct SUBFRAME_STATS {
+ av1_coeff_probs_model coef_probs_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES];
+ av1_coeff_count coef_counts_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES];
+ unsigned int eob_counts_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES][REF_TYPES]
+ [COEF_BANDS][COEFF_CONTEXTS];
+ av1_coeff_probs_model enc_starting_coef_probs[TX_SIZES][PLANE_TYPES];
+} SUBFRAME_STATS;
+#endif // CONFIG_ENTROPY
+
+typedef struct TileBufferEnc {
+ uint8_t *data;
+ size_t size;
+} TileBufferEnc;
+
typedef struct AV1_COMP {
QUANTS quants;
ThreadData td;
MB_MODE_INFO_EXT *mbmi_ext_base;
- DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
- DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]); // 8: SIMD width
+#if CONFIG_NEW_QUANT
+ DECLARE_ALIGNED(16, dequant_val_type_nuq,
+ y_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]);
+ DECLARE_ALIGNED(16, dequant_val_type_nuq,
+ uv_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]);
+#endif // CONFIG_NEW_QUANT
AV1_COMMON common;
AV1EncoderConfig oxcf;
struct lookahead_ctx *lookahead;
@@ -326,16 +355,16 @@
YV12_BUFFER_CONFIG scaled_last_source;
// Up-sampled reference buffers
- EncRefCntBuffer upsampled_ref_bufs[MAX_UPSAMPLED_BUFS];
- int upsampled_ref_idx[MAX_UPSAMPLED_BUFS];
-
- TileDataEnc *tile_data;
- int allocated_tiles; // Keep track of memory allocated for tiles.
+ // NOTE(zoeliu): Sufficient space needs to be allocated for the up-sampled
+ // reference buffers, which should include the up-sampled versions of all the
+ // possibly stored references plus the currently coded frame itself.
+ EncRefCntBuffer upsampled_ref_bufs[REF_FRAMES + 1];
+ int upsampled_ref_idx[REF_FRAMES + 1];
// For a still frame, this flag is set to 1 to skip partition search.
int partition_search_skippable_frame;
- int scaled_ref_idx[MAX_REF_FRAMES];
+ int scaled_ref_idx[TOTAL_REFS_PER_FRAME];
#if CONFIG_EXT_REFS
int lst_fb_idxes[LAST_REF_FRAMES];
#else
@@ -347,6 +376,8 @@
#endif // CONFIG_EXT_REFS
int alt_fb_idx;
+ int last_show_frame_buf_idx; // last show frame buffer index
+
int refresh_last_frame;
int refresh_golden_frame;
#if CONFIG_EXT_REFS
@@ -363,9 +394,9 @@
int ext_refresh_frame_context;
YV12_BUFFER_CONFIG last_frame_uf;
-
- TOKENEXTRA *tile_tok[4][1 << 6];
- unsigned int tok_count[4][1 << 6];
+#if CONFIG_LOOP_RESTORATION
+ YV12_BUFFER_CONFIG last_frame_db;
+#endif // CONFIG_LOOP_RESTORATION
// Ambient reconstruction err target for force key frames
int64_t ambient_err;
@@ -391,7 +422,10 @@
RATE_CONTROL rc;
double framerate;
- int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE];
+ // NOTE(zoeliu): Any inter frame allows a maximum of REF_FRAMES inter
+ // references; counting the currently coded frame itself as well, sufficient
+ // space needs to be allocated for the maximum possible number of frames.
+ int interp_filter_selected[REF_FRAMES + 1][SWITCHABLE];
struct aom_codec_pkt_list *output_pkt_list;
@@ -407,13 +441,13 @@
int allow_comp_inter_inter;
- unsigned char *segmentation_map;
+ uint8_t *segmentation_map;
CYCLIC_REFRESH *cyclic_refresh;
ActiveMap active_map;
fractional_mv_step_fp *find_fractional_mv_step;
- av1_full_search_fn_t full_search_sad;
+ av1_full_search_fn_t full_search_sad; // It is currently unused.
av1_diamond_search_fn_t diamond_search_sad;
aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
uint64_t time_receive_data;
@@ -478,23 +512,34 @@
search_site_config ss_cfg;
+ int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
#if CONFIG_REF_MV
int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2];
int zeromv_mode_cost[ZEROMV_MODE_CONTEXTS][2];
int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
- int drl_mode_cost[DRL_MODE_CONTEXTS][2];
+ int drl_mode_cost0[DRL_MODE_CONTEXTS][2];
+#if CONFIG_EXT_INTER
+ int new2mv_mode_cost[2];
+#endif // CONFIG_EXT_INTER
#endif
- int mbmode_cost[INTRA_MODES];
unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
-#if CONFIG_MOTION_VAR
+#if CONFIG_EXT_INTER
+ unsigned int inter_compound_mode_cost[INTER_MODE_CONTEXTS]
+ [INTER_COMPOUND_MODES];
+ unsigned int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
int motion_mode_cost[BLOCK_SIZES][MOTION_MODES];
-#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
int intra_uv_mode_cost[INTRA_MODES][INTRA_MODES];
int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+#if CONFIG_EXT_PARTITION_TYPES
+ int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+#else
int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
-
+#endif
#if CONFIG_PALETTE
int palette_y_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
int palette_uv_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
@@ -503,13 +548,33 @@
int palette_uv_color_cost[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS]
[PALETTE_COLORS];
#endif // CONFIG_PALETTE
+ int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
+#if CONFIG_EXT_TX
+ int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+ int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES];
+#else
+ int intra_tx_type_costs[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
+ int inter_tx_type_costs[EXT_TX_SIZES][TX_TYPES];
+#endif // CONFIG_EXT_TX
+#if CONFIG_EXT_INTRA
+ int intra_filter_cost[INTRA_FILTERS + 1][INTRA_FILTERS];
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_LOOP_RESTORATION
+ int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES];
+#endif // CONFIG_LOOP_RESTORATION
int multi_arf_allowed;
int multi_arf_enabled;
int multi_arf_last_grp_enabled;
- int intra_tx_type_costs[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
- int inter_tx_type_costs[EXT_TX_SIZES][TX_TYPES];
+ TileDataEnc *tile_data;
+ int allocated_tiles; // Keep track of memory allocated for tiles.
+
+ TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
+ unsigned int tok_count[MAX_TILE_ROWS][MAX_TILE_COLS];
+
+ TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
int resize_pending;
int resize_state;
@@ -520,26 +585,43 @@
int resize_count;
// VAR_BASED_PARTITION thresholds
- // 0 - threshold_64x64; 1 - threshold_32x32;
- // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
- int64_t vbp_thresholds[4];
+ // 0 - threshold_128x128;
+ // 1 - threshold_64x64;
+ // 2 - threshold_32x32;
+ // 3 - threshold_16x16;
+ // 4 - threshold_8x8;
+ int64_t vbp_thresholds[5];
int64_t vbp_threshold_minmax;
int64_t vbp_threshold_sad;
BLOCK_SIZE vbp_bsize_min;
-#if CONFIG_EXT_REFS
- int refresh_frame_mask;
- int existing_fb_idx_to_show;
-#endif // CONFIG_EXT_REFS
+ // VARIANCE_AQ segment map refresh
+ int vaq_refresh;
// Multi-threading
int num_workers;
AVxWorker *workers;
struct EncWorkerData *tile_thr_data;
AV1LfSync lf_row_sync;
+#if CONFIG_ENTROPY
+ SUBFRAME_STATS subframe_stats;
+ // TODO(yaowu): minimize the size of count buffers
+ SUBFRAME_STATS wholeframe_stats;
+ av1_coeff_stats branch_ct_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES];
+#endif // CONFIG_ENTROPY
#if CONFIG_ANS
struct BufAnsCoder buf_ans;
-#endif // CONFIG_ANS
+#endif
+#if CONFIG_EXT_REFS
+ int refresh_frame_mask;
+ int existing_fb_idx_to_show;
+ int is_arf_filter_off[MAX_EXT_ARFS + 1];
+ int num_extra_arfs;
+ int arf_map[MAX_EXT_ARFS + 1];
+#endif // CONFIG_EXT_REFS
+#if CONFIG_GLOBAL_MOTION
+ int global_motion_used[TOTAL_REFS_PER_FRAME];
+#endif
} AV1_COMP;
void av1_initialize_enc(void);
@@ -562,6 +644,8 @@
int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest);
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame);
+
int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags);
void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags);
@@ -586,6 +670,9 @@
int av1_get_quantizer(struct AV1_COMP *cpi);
+void av1_full_to_model_counts(av1_coeff_count_model *model_count,
+ av1_coeff_count *full_count);
+
static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
(cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
@@ -609,7 +696,8 @@
return cpi->alt_fb_idx;
}
-static INLINE int get_ref_frame_buf_idx(const AV1_COMP *cpi, int ref_frame) {
+static INLINE int get_ref_frame_buf_idx(const AV1_COMP *cpi,
+ MV_REFERENCE_FRAME ref_frame) {
const AV1_COMMON *const cm = &cpi->common;
const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX;
@@ -644,7 +732,7 @@
}
#endif // CONFIG_EXT_REFS
-static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
+static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols) {
// We assume 3 planes all at full resolution. We assume up to 1 token per
// pixel, and then allow a head room of 1 EOSB token per 4x4 block per plane,
// plus EOSB_TOKEN per plane.
@@ -653,7 +741,7 @@
// Get the allocated token size for a tile. It does the same calculation as in
// the frame token allocation.
-static INLINE int allocated_tokens(TileInfo tile) {
+static INLINE unsigned int allocated_tokens(TileInfo tile) {
int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 1) >> 1;
int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 1) >> 1;
@@ -683,6 +771,16 @@
cpi->oxcf.enable_auto_arf;
}
+// TODO(zoeliu): To set up cpi->oxcf.enable_auto_brf
+#if 0 && CONFIG_EXT_REFS
+static INLINE int is_bwdref_enabled(const AV1_COMP *const cpi) {
+ // NOTE(zoeliu): The enabling of bi-predictive frames depends on the use of
+ // alt_ref, and will be turned off when the alt_ref interval is
+ // not sufficiently large.
+ return is_altref_enabled(cpi) && cpi->oxcf.enable_auto_brf;
+}
+#endif // CONFIG_EXT_REFS
+
static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
MV_REFERENCE_FRAME ref0,
MV_REFERENCE_FRAME ref1) {
diff --git a/av1/encoder/ethread.c b/av1/encoder/ethread.c
index 47e286f..5876d15 100644
--- a/av1/encoder/ethread.c
+++ b/av1/encoder/ethread.c
@@ -1,12 +1,11 @@
/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
*/
#include "av1/encoder/encodeframe.h"
@@ -28,17 +27,13 @@
for (n = 0; n < ENTROPY_TOKENS; n++)
td->rd_counts.coef_counts[i][j][k][l][m][n] +=
td_t->rd_counts.coef_counts[i][j][k][l][m][n];
-
- // Counts of all motion searches and exhuastive mesh searches.
- td->rd_counts.m_search_count += td_t->rd_counts.m_search_count;
- td->rd_counts.ex_search_count += td_t->rd_counts.ex_search_count;
}
static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
AV1_COMP *const cpi = thread_data->cpi;
const AV1_COMMON *const cm = &cpi->common;
- const int tile_cols = 1 << cm->log2_tile_cols;
- const int tile_rows = 1 << cm->log2_tile_rows;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
int t;
(void)unused;
@@ -56,7 +51,7 @@
void av1_encode_tiles_mt(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
- const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_cols = cm->tile_cols;
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
const int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols);
int i;
@@ -65,24 +60,22 @@
// Only run once to create threads and allocate thread data.
if (cpi->num_workers == 0) {
- int allocated_workers = num_workers;
-
CHECK_MEM_ERROR(cm, cpi->workers,
- aom_malloc(allocated_workers * sizeof(*cpi->workers)));
+ aom_malloc(num_workers * sizeof(*cpi->workers)));
CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
- aom_calloc(allocated_workers, sizeof(*cpi->tile_thr_data)));
+ aom_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
- for (i = 0; i < allocated_workers; i++) {
+ for (i = 0; i < num_workers; i++) {
AVxWorker *const worker = &cpi->workers[i];
- EncWorkerData *thread_data = &cpi->tile_thr_data[i];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
++cpi->num_workers;
winterface->init(worker);
- if (i < allocated_workers - 1) {
- thread_data->cpi = cpi;
+ thread_data->cpi = cpi;
+ if (i < num_workers - 1) {
// Allocate thread data.
CHECK_MEM_ERROR(cm, thread_data->td,
aom_memalign(32, sizeof(*thread_data->td)));
@@ -93,6 +86,10 @@
thread_data->td->pc_tree = NULL;
av1_setup_pc_tree(cm, thread_data->td);
+ // Set up variance tree if needed.
+ if (cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+ av1_setup_var_tree(cm, &cpi->td);
+
// Allocate frame counters in thread data.
CHECK_MEM_ERROR(cm, thread_data->td->counts,
aom_calloc(1, sizeof(*thread_data->td->counts)));
@@ -103,7 +100,6 @@
"Tile encoder thread creation failed");
} else {
// Main thread acts as a worker and uses the thread data in cpi.
- thread_data->cpi = cpi;
thread_data->td = &cpi->td;
}
@@ -166,7 +162,7 @@
// Accumulate counters.
if (i < cpi->num_workers - 1) {
- av1_accumulate_frame_counts(cm, thread_data->td->counts, 0);
+ av1_accumulate_frame_counts(cm, thread_data->td->counts);
accumulate_rd_opt(&cpi->td, thread_data->td);
}
}
diff --git a/av1/encoder/firstpass.c b/av1/encoder/firstpass.c
index 9b86005..1ea28f2 100644
--- a/av1/encoder/firstpass.c
+++ b/av1/encoder/firstpass.c
@@ -27,7 +27,6 @@
#include "av1/common/entropymv.h"
#include "av1/common/quant_common.h"
#include "av1/common/reconinter.h" // av1_setup_dst_planes()
-#include "av1/common/scan.h"
#include "av1/encoder/aq_variance.h"
#include "av1/encoder/block.h"
#include "av1/encoder/encodeframe.h"
@@ -47,7 +46,6 @@
#define BOOST_BREAKOUT 12.5
#define BOOST_FACTOR 12.5
-#define ERR_DIVISOR 128.0
#define FACTOR_PT_LOW 0.70
#define FACTOR_PT_HIGH 0.90
#define FIRST_PASS_Q 10.0
@@ -215,6 +213,13 @@
section->duration -= frame->duration;
}
+// Calculate the linear size relative to a baseline of 1080P
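+// (For example, a 3840x2160 input gives sqrt(8294400 / 2073600) = 2.0, i.e.
+// twice the linear size of 1080P, while a 1280x720 input comes out to
+// roughly 0.67.)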
+#define BASE_SIZE 2073600.0 // 1920x1080
+static double get_linear_size_factor(const AV1_COMP *cpi) {
+ const double this_area = cpi->initial_width * cpi->initial_height;
+ return pow(this_area / BASE_SIZE, 0.5);
+}
+
// Calculate an active area of the image that discounts formatting
// bars and partially discounts other 0 energy areas.
#define MIN_ACTIVE_AREA 0.5
@@ -454,7 +459,8 @@
TileInfo tile;
struct macroblock_plane *const p = x->plane;
struct macroblockd_plane *const pd = xd->plane;
- const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
+ const PICK_MODE_CONTEXT *ctx =
+ &cpi->td.pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2]->none;
int i;
int recon_yoffset, recon_uvoffset;
@@ -486,6 +492,7 @@
double intra_factor;
double brightness_factor;
BufferPool *const pool = cm->buffer_pool;
+ const int qindex = find_fp_qindex(cm->bit_depth);
#if CONFIG_PVQ
PVQ_QUEUE pvq_q;
#endif
@@ -507,7 +514,7 @@
neutral_count = 0.0;
set_first_pass_params(cpi);
- av1_set_quantizer(cm, find_fp_qindex(cm->bit_depth));
+ av1_set_quantizer(cm, qindex);
av1_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
@@ -612,13 +619,20 @@
set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize],
mb_col << 1, num_8x8_blocks_wide_lookup[bsize],
cm->mi_rows, cm->mi_cols);
+ set_plane_n4(xd, num_8x8_blocks_wide_lookup[bsize],
+ num_8x8_blocks_high_lookup[bsize],
+ mi_width_log2_lookup[bsize], mi_height_log2_lookup[bsize]);
// Do intra 16x16 prediction.
xd->mi[0]->mbmi.segment_id = 0;
+#if CONFIG_SUPERTX
+ xd->mi[0]->mbmi.segment_id_supertx = 0;
+#endif // CONFIG_SUPERTX
+ xd->lossless[xd->mi[0]->mbmi.segment_id] = (qindex == 0);
xd->mi[0]->mbmi.mode = DC_PRED;
xd->mi[0]->mbmi.tx_size =
use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
- av1_encode_intra_block_plane(cm, x, bsize, 0);
+ av1_encode_intra_block_plane(cm, x, bsize, 0, 0);
this_error = aom_get_mb_ss(x->plane[0].src_diff);
// Keep a record of blocks that have almost no intra error residual
@@ -1120,11 +1134,7 @@
return fclamp(pow(error_term, power_term), 0.05, 5.0);
}
-// Larger image formats are expected to be a little harder to code relatively
-// given the same prediction error score. This in part at least relates to the
-// increased size and hence coding cost of motion vectors.
-#define EDIV_SIZE_FACTOR 800
-
+#define ERR_DIVISOR 100.0
static int get_twopass_worst_quality(const AV1_COMP *cpi,
const double section_err,
double inactive_zone,
@@ -1144,12 +1154,22 @@
const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
const double av_err_per_mb = section_err / active_mbs;
const double speed_term = 1.0 + 0.04 * oxcf->speed;
- const double ediv_size_correction = (double)num_mbs / EDIV_SIZE_FACTOR;
+ double ediv_size_correction;
const int target_norm_bits_per_mb =
((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) / active_mbs;
-
int q;
+ // Larger image formats are expected to be a little harder to code
+ // relatively given the same prediction error score. This in part at
+ // least relates to the increased size and hence coding overheads of
+ // motion vectors. Some account of this is made through adjustment of
+ // the error divisor.
+ ediv_size_correction =
+ AOMMAX(0.2, AOMMIN(5.0, get_linear_size_factor(cpi)));
+ if (ediv_size_correction < 1.0)
+ ediv_size_correction = -(1.0 / ediv_size_correction);
+ ediv_size_correction *= 4.0;
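+ // For example, a 1080P input (linear factor 1.0) maps to a correction of
+ // 4.0, a 4K input (factor 2.0) to 8.0, and a 480x270 input (factor 0.25)
+ // to -(1.0 / 0.25) * 4.0 = -16.0.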
+
// Try and pick a max Q that will be high enough to encode the
// content at the given rate.
for (q = rc->best_quality; q < rc->worst_quality; ++q) {
@@ -1582,6 +1602,7 @@
0);
}
+#if !CONFIG_EXT_REFS
// Current limit on maximum number of active arfs in a GF/ARF group.
#define MAX_ACTIVE_ARFS 2
#define ARF_SLOT1 2
@@ -1593,6 +1614,7 @@
arf_buffer_indices[0] = ARF_SLOT1;
arf_buffer_indices[1] = ARF_SLOT2;
}
+#endif
static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
double group_error, int gf_arf_bits) {
@@ -1610,9 +1632,6 @@
double modified_err = 0.0;
double err_fraction;
int mid_boost_bits = 0;
- int mid_frame_idx;
- unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
-
#if CONFIG_EXT_REFS
// The use of bi-predictive frames are only enabled when following 3
// conditions are met:
@@ -1626,11 +1645,29 @@
(rc->baseline_gf_interval - rc->source_alt_ref_pending);
int bipred_group_end = 0;
int bipred_frame_index = 0;
+
+ int arf_pos[MAX_EXT_ARFS + 1];
+ const unsigned char ext_arf_interval =
+ (unsigned char)(rc->baseline_gf_interval / (cpi->num_extra_arfs + 1) - 1);
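+ // E.g. with baseline_gf_interval = 16 and one extra ARF, this works out to
+ // 16 / 2 - 1 = 7.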
+ int which_arf = cpi->num_extra_arfs;
+ int subgroup_interval[MAX_EXT_ARFS + 1];
+ int ext_arf_boost[MAX_EXT_ARFS];
+ int is_sg_bipred_enabled = is_bipred_enabled;
+ int accumulative_subgroup_interval = 0;
+#else
+ int mid_frame_idx;
+ unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
+#endif // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+ av1_zero_array(ext_arf_boost, MAX_EXT_ARFS);
#endif // CONFIG_EXT_REFS
key_frame = cpi->common.frame_type == KEY_FRAME;
+#if !CONFIG_EXT_REFS
get_arf_buffer_indices(arf_buffer_indices);
+#endif // !CONFIG_EXT_REFS
// For key frames the frame target rate is already set and it
// is also the golden frame.
@@ -1644,16 +1681,20 @@
gf_group->rf_level[frame_index] = GF_ARF_STD;
gf_group->bit_allocation[frame_index] = gf_arf_bits;
}
+#if CONFIG_EXT_REFS
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->arf_ref_idx[frame_index] = 0;
+#else
gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
-
+#endif // CONFIG_EXT_REFS
// Step over the golden frame / overlay frame
if (EOF == input_stats(twopass, &frame_stats)) return;
}
#if CONFIG_EXT_REFS
+ gf_group->bidir_pred_enabled[frame_index] = 0;
gf_group->brf_src_offset[frame_index] = 0;
- gf_group->brf_pred_enabled[frame_index] = 0;
#endif // CONFIG_EXT_REFS
// Deduct the boost bits for arf (or gf if it is not a key frame)
@@ -1661,6 +1702,7 @@
if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits;
frame_index++;
+
#if CONFIG_EXT_REFS
bipred_frame_index++;
#endif // CONFIG_EXT_REFS
@@ -1674,18 +1716,55 @@
gf_group->arf_src_offset[frame_index] =
(unsigned char)(rc->baseline_gf_interval - 1);
+#if CONFIG_EXT_REFS
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->arf_ref_idx[frame_index] = 0;
+
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+// NOTE: "bidir_pred_frame_index" stays unchanged for ARF_UPDATE frames.
+#else
gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
gf_group->arf_ref_idx[frame_index] =
arf_buffer_indices[cpi->multi_arf_last_grp_enabled &&
rc->source_alt_ref_active];
+#endif // CONFIG_EXT_REFS
+
#if CONFIG_EXT_REFS
- gf_group->brf_src_offset[frame_index] = 0;
- gf_group->brf_pred_enabled[frame_index] = 0;
-// NOTE: "bipred_frame_index" stays unchanged for ARF_UPDATE frames.
+ // Work out the ARFs' positions in this gf group
+ // NOTE(weitinglin): ALT_REFs are indexed inversely, but coded in display
+ // order (except for the original ARF). In the example of three ALT_REFs,
+ // we index them as: KEY ----- ALT2 ----- ALT1 ----- ALT0
+ // but code them in the following order:
+ // KEY-ALT0-ALT2 ----- OVERLAY2-ALT1 ----- OVERLAY1 ----- OVERLAY0
+ arf_pos[0] =
+ frame_index + cpi->num_extra_arfs + gf_group->arf_src_offset[1] + 1;
+ for (i = 0; i < cpi->num_extra_arfs; ++i) {
+ arf_pos[i + 1] =
+ frame_index + (cpi->num_extra_arfs - i) * (ext_arf_interval + 2);
+ subgroup_interval[i] = arf_pos[i] - arf_pos[i + 1] - (i == 0 ? 1 : 2);
+ }
+ subgroup_interval[cpi->num_extra_arfs] = arf_pos[cpi->num_extra_arfs] -
+ frame_index -
+ (cpi->num_extra_arfs == 0 ? 1 : 2);
#endif // CONFIG_EXT_REFS
++frame_index;
+#if CONFIG_EXT_REFS
+ // Insert an extra ARF
+ if (cpi->num_extra_arfs) {
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ // NOTE(weitinglin): GF_ARF_LOW is also used as an identifier
+ // for internal ALT_REFs:
+ gf_group->rf_level[frame_index] = GF_ARF_LOW;
+ gf_group->arf_src_offset[frame_index] = ext_arf_interval;
+ gf_group->arf_update_idx[frame_index] = which_arf;
+ gf_group->arf_ref_idx[frame_index] = 0;
+ ++frame_index;
+ }
+ accumulative_subgroup_interval += subgroup_interval[cpi->num_extra_arfs];
+#else
if (cpi->multi_arf_enabled) {
// Set aside a slot for a level 1 arf.
gf_group->update_type[frame_index] = ARF_UPDATE;
@@ -1696,14 +1775,20 @@
gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
++frame_index;
}
+#endif // CONFIG_EXT_REFS
}
+#if !CONFIG_EXT_REFS
// Define middle frame
mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
+#endif // !CONFIG_EXT_REFS
// Allocate bits to the other frames in the group.
for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) {
+#if !CONFIG_EXT_REFS
int arf_idx = 0;
+#endif // !CONFIG_EXT_REFS
+
if (EOF == input_stats(twopass, &frame_stats)) break;
modified_err = calculate_modified_err(cpi, twopass, oxcf, &frame_stats);
@@ -1718,55 +1803,71 @@
if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) {
mid_boost_bits += (target_frame_size >> 4);
target_frame_size -= (target_frame_size >> 4);
-
+#if !CONFIG_EXT_REFS
if (frame_index <= mid_frame_idx) arf_idx = 1;
+#endif // !CONFIG_EXT_REFS
}
+
+#if CONFIG_EXT_REFS
+ gf_group->arf_update_idx[frame_index] = which_arf;
+ gf_group->arf_ref_idx[frame_index] = which_arf;
+#else
gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx];
gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx];
+#endif // CONFIG_EXT_REFS
target_frame_size =
clamp(target_frame_size, 0, AOMMIN(max_bits, (int)total_group_bits));
#if CONFIG_EXT_REFS
- // NOTE: Bi-predictive frames are only enabled when the length of the
- // bi-predictive frame group interval is strictly smaller than that
- // of the golden frame group interval.
- // TODO(zoeliu): Currently bi-prediction is only enabled when alt-ref is on.
- if (is_bipred_enabled && !bipred_group_end) {
+ // If we are going to have ARFs, check if we can have BWDREF in this
+ // subgroup.
+ if (rc->source_alt_ref_pending) {
+ is_sg_bipred_enabled =
+ is_bipred_enabled &&
+ (subgroup_interval[which_arf] > rc->bipred_group_interval);
+ }
+
+ // NOTE: BIDIR_PRED is only enabled when the length of the bi-predictive
+ // frame group interval is strictly smaller than that of the GOLDEN
+ // FRAME group interval.
+ // TODO(zoeliu): Currently BIDIR_PRED is only enabled when alt-ref is on.
+ if (is_sg_bipred_enabled && !bipred_group_end) {
const int cur_brf_src_offset = rc->bipred_group_interval - 1;
// --- BRF_UPDATE ---
if (bipred_frame_index == 1) {
gf_group->update_type[frame_index] = BRF_UPDATE;
+ gf_group->bidir_pred_enabled[frame_index] = 1;
gf_group->brf_src_offset[frame_index] = cur_brf_src_offset;
- gf_group->brf_pred_enabled[frame_index] = 1;
// --- LAST_BIPRED_UPDATE ---
} else if (bipred_frame_index == rc->bipred_group_interval) {
gf_group->update_type[frame_index] = LAST_BIPRED_UPDATE;
+ gf_group->bidir_pred_enabled[frame_index] = 1;
gf_group->brf_src_offset[frame_index] = 0;
- gf_group->brf_pred_enabled[frame_index] = 1;
// Reset the bi-predictive frame index.
bipred_frame_index = 0;
// --- BIPRED_UPDATE ---
} else {
gf_group->update_type[frame_index] = BIPRED_UPDATE;
+ gf_group->bidir_pred_enabled[frame_index] = 1;
gf_group->brf_src_offset[frame_index] = 0;
- gf_group->brf_pred_enabled[frame_index] = 1;
}
bipred_frame_index++;
// Check whether the next bi-predictive frame group would entirely be
// included within the current golden frame group.
+ // In addition, we need to avoid coding a BRF right before an ARF.
if (bipred_frame_index == 1 &&
- (i + 1 + cur_brf_src_offset) >=
- (rc->baseline_gf_interval - rc->source_alt_ref_pending))
+ (i + 2 + cur_brf_src_offset) >= accumulative_subgroup_interval) {
bipred_group_end = 1;
+ }
} else {
#endif // CONFIG_EXT_REFS
gf_group->update_type[frame_index] = LF_UPDATE;
#if CONFIG_EXT_REFS
+ gf_group->bidir_pred_enabled[frame_index] = 0;
gf_group->brf_src_offset[frame_index] = 0;
- gf_group->brf_pred_enabled[frame_index] = 0;
}
#endif // CONFIG_EXT_REFS
@@ -1795,19 +1896,55 @@
#endif // CONFIG_EXT_REFS
++frame_index;
+
+#if CONFIG_EXT_REFS
+ // Check if we need to update the ARF
+ if (cpi->num_extra_arfs && frame_index > arf_pos[which_arf]) {
+ --which_arf;
+ accumulative_subgroup_interval += subgroup_interval[which_arf] + 1;
+ // We have reached a new subgroup; reset the bipred_group_end flag.
+ bipred_group_end = 0;
+ // Insert another extra ARF after the overlay frame
+ if (which_arf) {
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_LOW;
+ gf_group->arf_src_offset[frame_index] = ext_arf_interval;
+ gf_group->arf_update_idx[frame_index] = which_arf;
+ gf_group->arf_ref_idx[frame_index] = 0;
+ ++frame_index;
+ }
+ }
+#endif // CONFIG_EXT_REFS
}
- // Note:
- // We need to configure the frame at the end of the sequence + 1 that will be
- // the start frame for the next group. Otherwise prior to the call to
- // av1_rc_get_second_pass_params() the data will be undefined.
+// Note:
+// We need to configure the frame at the end of the sequence + 1 that will be
+// the start frame for the next group. Otherwise prior to the call to
+// av1_rc_get_second_pass_params() the data will be undefined.
+#if CONFIG_EXT_REFS
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->arf_ref_idx[frame_index] = 0;
+#else
gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
+#endif // CONFIG_EXT_REFS
if (rc->source_alt_ref_pending) {
gf_group->update_type[frame_index] = OVERLAY_UPDATE;
gf_group->rf_level[frame_index] = INTER_NORMAL;
+#if CONFIG_EXT_REFS
+ if (cpi->num_extra_arfs) {
+ for (i = cpi->num_extra_arfs; i > 0; --i) {
+ int arf_pos_in_gf = (i == cpi->num_extra_arfs ? 2 : arf_pos[i + 1] + 1);
+ gf_group->bit_allocation[arf_pos_in_gf] =
+ gf_group->bit_allocation[arf_pos[i]];
+ gf_group->update_type[arf_pos[i]] = INTNL_OVERLAY_UPDATE;
+ gf_group->bit_allocation[arf_pos[i]] = 0;
+ gf_group->rf_level[arf_pos[i]] = INTER_LOW;
+ }
+ }
+#else
// Final setup for second arf and its overlay.
if (cpi->multi_arf_enabled) {
gf_group->bit_allocation[2] =
@@ -1815,13 +1952,15 @@
gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE;
gf_group->bit_allocation[mid_frame_idx] = 0;
}
+#endif // CONFIG_EXT_REFS
} else {
gf_group->update_type[frame_index] = GF_UPDATE;
gf_group->rf_level[frame_index] = GF_ARF_STD;
}
+
#if CONFIG_EXT_REFS
+ gf_group->bidir_pred_enabled[frame_index] = 0;
gf_group->brf_src_offset[frame_index] = 0;
- gf_group->brf_pred_enabled[frame_index] = 0;
#endif // CONFIG_EXT_REFS
// Note whether multi-arf was enabled this group for next time.
@@ -1911,6 +2050,7 @@
cpi->common.bit_depth));
int int_lbq = (int)(av1_convert_qindex_to_q(rc->last_boosted_qindex,
cpi->common.bit_depth));
+
active_min_gf_interval = rc->min_gf_interval + AOMMIN(2, int_max_q / 200);
if (active_min_gf_interval > rc->max_gf_interval)
active_min_gf_interval = rc->max_gf_interval;
@@ -1923,13 +2063,12 @@
// At high Q when there are few bits to spare we are better with a longer
// interval to spread the cost of the GF.
active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6));
- if (active_max_gf_interval < active_min_gf_interval)
- active_max_gf_interval = active_min_gf_interval;
- if (active_max_gf_interval > rc->max_gf_interval)
- active_max_gf_interval = rc->max_gf_interval;
+ // We have: active_min_gf_interval <= rc->max_gf_interval
if (active_max_gf_interval < active_min_gf_interval)
active_max_gf_interval = active_min_gf_interval;
+ else if (active_max_gf_interval > rc->max_gf_interval)
+ active_max_gf_interval = rc->max_gf_interval;
}
}
@@ -2030,13 +2169,20 @@
// Set the interval until the next gf.
rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
+#if CONFIG_EXT_REFS
+ // Compute how many extra alt_refs we can have
+ cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval,
+ rc->source_alt_ref_pending);
+ // Currently a maximum of two extra ARFs are allowed.
+ assert(cpi->num_extra_arfs <= MAX_EXT_ARFS);
+#endif // CONFIG_EXT_REFS
+
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
#if CONFIG_EXT_REFS
rc->bipred_group_interval = BFG_INTERVAL;
// The minimum bi-predictive frame group interval is 2.
- if (rc->bipred_group_interval < MIN_BFG_INTERVAL)
- rc->bipred_group_interval = 0;
+ if (rc->bipred_group_interval < 2) rc->bipred_group_interval = 0;
#endif // CONFIG_EXT_REFS
// Reset the file position.
@@ -2227,6 +2373,8 @@
return is_viable_kf;
}
+#define FRAMES_TO_CHECK_DECAY 8
+
static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
int i, j;
RATE_CONTROL *const rc = &cpi->rc;
@@ -2245,7 +2393,7 @@
double boost_score = 0.0;
double kf_mod_err = 0.0;
double kf_group_err = 0.0;
- double recent_loop_decay[8] = { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
+ double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
av1_zero(next_frame);
@@ -2272,6 +2420,9 @@
kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ // Initialize the decay rates for the recent frames to check
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
+
// Find the next keyframe.
i = 0;
while (twopass->stats_in < twopass->stats_in_end &&
@@ -2298,9 +2449,10 @@
// We want to know something about the recent past... rather than
// as used elsewhere where we are concerned with decay in prediction
// quality since the last GF or KF.
- recent_loop_decay[i % 8] = loop_decay_rate;
+ recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate;
decay_accumulator = 1.0;
- for (j = 0; j < 8; ++j) decay_accumulator *= recent_loop_decay[j];
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+ decay_accumulator *= recent_loop_decay[j];
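+ // E.g. if each of the last FRAMES_TO_CHECK_DECAY frames had a
+ // loop_decay_rate of 0.9, the accumulator works out to 0.9^8, roughly 0.43.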
// Special check for transition or high motion followed by a
// static scene.
@@ -2462,32 +2614,57 @@
static void configure_buffer_updates(AV1_COMP *cpi) {
TWO_PASS *const twopass = &cpi->twopass;
+ // Wei-Ting: Should we define another function to take care of
+ // cpi->rc.is_$Source_Type to make this function as it is in the comment?
+
cpi->rc.is_src_frame_alt_ref = 0;
#if CONFIG_EXT_REFS
cpi->rc.is_bwd_ref_frame = 0;
cpi->rc.is_last_bipred_frame = 0;
cpi->rc.is_bipred_frame = 0;
+ cpi->rc.is_src_frame_ext_arf = 0;
#endif // CONFIG_EXT_REFS
switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
- case KF_UPDATE: cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 1;
+ case KF_UPDATE:
#if CONFIG_EXT_REFS
cpi->refresh_bwd_ref_frame = 1;
#endif // CONFIG_EXT_REFS
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 1;
cpi->refresh_alt_ref_frame = 1;
break;
- case LF_UPDATE: cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 0;
+
+ case LF_UPDATE:
+#if CONFIG_EXT_REFS
+ // If we have extra ALT_REFs, we can use the farthest ALT (ALT0) as
+ // the BWD_REF.
+ if (cpi->num_extra_arfs) {
+ int tmp = cpi->bwd_fb_idx;
+
+ cpi->bwd_fb_idx = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = cpi->arf_map[0];
+ cpi->arf_map[0] = tmp;
+
+ cpi->rc.is_bwd_ref_frame = 1;
+ } else {
+ cpi->rc.is_bwd_ref_frame = 0;
+ }
+#endif // CONFIG_EXT_REFS
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+ break;
+
+ case GF_UPDATE:
#if CONFIG_EXT_REFS
cpi->refresh_bwd_ref_frame = 0;
#endif // CONFIG_EXT_REFS
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 1;
cpi->refresh_alt_ref_frame = 0;
break;
- case GF_UPDATE: cpi->refresh_last_frame = 1; cpi->refresh_golden_frame = 1;
-#if CONFIG_EXT_REFS
- cpi->refresh_bwd_ref_frame = 0;
-#endif // CONFIG_EXT_REFS
- cpi->refresh_alt_ref_frame = 0;
- break;
+
case OVERLAY_UPDATE:
cpi->refresh_last_frame = 0;
cpi->refresh_golden_frame = 1;
@@ -2497,12 +2674,16 @@
cpi->refresh_alt_ref_frame = 0;
cpi->rc.is_src_frame_alt_ref = 1;
break;
- case ARF_UPDATE: cpi->refresh_last_frame = 0; cpi->refresh_golden_frame = 0;
+
+ case ARF_UPDATE:
#if CONFIG_EXT_REFS
- cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_bwd_ref_frame = 1;
#endif // CONFIG_EXT_REFS
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
cpi->refresh_alt_ref_frame = 1;
break;
+
#if CONFIG_EXT_REFS
case BRF_UPDATE:
cpi->refresh_last_frame = 0;
@@ -2510,7 +2691,19 @@
cpi->refresh_bwd_ref_frame = 1;
cpi->refresh_alt_ref_frame = 0;
cpi->rc.is_bwd_ref_frame = 1;
+ if (cpi->num_extra_arfs) {
+      // Allow BRF to use the farthest ALT_REF (ALT0) as the BWD_REF by swapping
+ // the virtual indices.
+ // NOTE: The indices will be swapped back after this frame is encoded
+ // (in av1_update_reference_frames()).
+ int tmp = cpi->bwd_fb_idx;
+
+ cpi->bwd_fb_idx = cpi->alt_fb_idx;
+ cpi->alt_fb_idx = cpi->arf_map[0];
+ cpi->arf_map[0] = tmp;
+ }
break;
+
case LAST_BIPRED_UPDATE:
cpi->refresh_last_frame = 0;
cpi->refresh_golden_frame = 0;
@@ -2518,6 +2711,7 @@
cpi->refresh_alt_ref_frame = 0;
cpi->rc.is_last_bipred_frame = 1;
break;
+
case BIPRED_UPDATE:
cpi->refresh_last_frame = 1;
cpi->refresh_golden_frame = 0;
@@ -2525,7 +2719,17 @@
cpi->refresh_alt_ref_frame = 0;
cpi->rc.is_bipred_frame = 1;
break;
+
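+    // Internal overlay: the source frame that coincides with an extra
+    // (internal) ALT_REF, so both source-frame flags are set.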
+ case INTNL_OVERLAY_UPDATE:
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+ cpi->rc.is_src_frame_alt_ref = 1;
+ cpi->rc.is_src_frame_ext_arf = 1;
+ break;
#endif // CONFIG_EXT_REFS
+
default: assert(0); break;
}
}
@@ -2611,6 +2815,7 @@
rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
}
+
av1_zero(this_frame);
if (EOF == input_stats(twopass, &this_frame)) return;
@@ -2660,6 +2865,7 @@
}
target_rate = gf_group->bit_allocation[gf_group->index];
+
if (cpi->common.frame_type == KEY_FRAME)
target_rate = av1_rc_clamp_iframe_target_size(cpi, target_rate);
else
diff --git a/av1/encoder/firstpass.h b/av1/encoder/firstpass.h
index d2d929d..db459cc 100644
--- a/av1/encoder/firstpass.h
+++ b/av1/encoder/firstpass.h
@@ -45,7 +45,11 @@
// NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain
// number of bi-predictive frames.
#define BFG_INTERVAL 2
-#define MIN_BFG_INTERVAL 2
+// The maximum number of extra ALT_REFs.
+// NOTE: This number cannot be greater than 2 or the reference frame buffer will
+// overflow.
+#define MAX_EXT_ARFS 2
+#define MIN_EXT_ARF_INTERVAL 4
#endif // CONFIG_EXT_REFS
#define VLOW_MOTION_THRESHOLD 950
@@ -82,10 +86,11 @@
ARF_UPDATE = 3,
OVERLAY_UPDATE = 4,
#if CONFIG_EXT_REFS
- BRF_UPDATE = 5, // Backward Reference Frame
- LAST_BIPRED_UPDATE = 6, // Last Bi-Predictive Frame
- BIPRED_UPDATE = 7, // Bi-Predictive Frame, but not the last one
- FRAME_UPDATE_TYPES = 8
+ BRF_UPDATE = 5, // Backward Reference Frame
+ LAST_BIPRED_UPDATE = 6, // Last Bi-predictive Frame
+ BIPRED_UPDATE = 7, // Bi-predictive Frame, but not the last one
+ INTNL_OVERLAY_UPDATE = 8, // Internal Overlay Frame
+ FRAME_UPDATE_TYPES = 9
#else
FRAME_UPDATE_TYPES = 5
#endif // CONFIG_EXT_REFS
@@ -107,7 +112,7 @@
unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
#if CONFIG_EXT_REFS
unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
- unsigned char brf_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1];
#endif // CONFIG_EXT_REFS
int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
} GF_GROUP;
@@ -177,6 +182,19 @@
void av1_calculate_coded_size(struct AV1_COMP *cpi, int *scaled_frame_width,
int *scaled_frame_height);
+#if CONFIG_EXT_REFS
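+// Returns the number of extra ALT_REF frames to use for a GF group of the
+// given interval when an ARF is pending. With MAX_EXT_ARFS == 2 and
+// MIN_EXT_ARF_INTERVAL == 4, intervals of at least 12 frames get 2 extra
+// ARFs, intervals of at least 8 frames get 1, and shorter intervals get none.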
+static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) {
+ if (arf_pending && MAX_EXT_ARFS > 0)
+ return interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1)
+ ? MAX_EXT_ARFS
+ : interval >= MIN_EXT_ARF_INTERVAL * MAX_EXT_ARFS
+ ? MAX_EXT_ARFS - 1
+ : 0;
+ else
+ return 0;
+}
+#endif // CONFIG_EXT_REFS
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/encoder/global_motion.c b/av1/encoder/global_motion.c
new file mode 100644
index 0000000..d8abea9
--- /dev/null
+++ b/av1/encoder/global_motion.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/global_motion.h"
+#include "av1/encoder/corner_detect.h"
+#include "av1/encoder/corner_match.h"
+#include "av1/encoder/ransac.h"
+
+#define MAX_CORNERS 4096
+#define MIN_INLIER_PROB 0.1
+
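+// Map a transformation model type to the corresponding RANSAC fitting routine.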
+static INLINE RansacFunc get_ransac_type(TransformationType type) {
+ switch (type) {
+ case HOMOGRAPHY: return ransac_homography;
+ case AFFINE: return ransac_affine;
+ case ROTZOOM: return ransac_rotzoom;
+ case TRANSLATION: return ransac_translation;
+ default: assert(0); return NULL;
+ }
+}
+
+// computes global motion parameters by fitting a model using RANSAC
+static int compute_global_motion_params(TransformationType type,
+ double *correspondences,
+ int num_correspondences, double *params,
+ int *inlier_map) {
+ int result;
+ int num_inliers = 0;
+ RansacFunc ransac = get_ransac_type(type);
+ if (ransac == NULL) return 0;
+
+ result = ransac(correspondences, num_correspondences, &num_inliers,
+ inlier_map, params);
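+  // When the RANSAC call returns 0 but the inlier fraction is below
+  // MIN_INLIER_PROB, report zero inliers so the caller rejects the model.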
+ if (!result && num_inliers < MIN_INLIER_PROB * num_correspondences) {
+ result = 1;
+ num_inliers = 0;
+ }
+ return num_inliers;
+}
+
+int compute_global_motion_feature_based(TransformationType type,
+ YV12_BUFFER_CONFIG *frm,
+ YV12_BUFFER_CONFIG *ref,
+ double *params) {
+ int num_frm_corners, num_ref_corners;
+ int num_correspondences;
+ double *correspondences;
+ int num_inliers;
+ int frm_corners[2 * MAX_CORNERS], ref_corners[2 * MAX_CORNERS];
+ int *inlier_map = NULL;
+
+ // compute interest points in images using FAST features
+ num_frm_corners =
+ fast_corner_detect(frm->y_buffer, frm->y_width, frm->y_height,
+ frm->y_stride, frm_corners, MAX_CORNERS);
+ num_ref_corners =
+ fast_corner_detect(ref->y_buffer, ref->y_width, ref->y_height,
+ ref->y_stride, ref_corners, MAX_CORNERS);
+
+ // find correspondences between the two images
+ correspondences =
+ (double *)malloc(num_frm_corners * 4 * sizeof(*correspondences));
+ num_correspondences = determine_correspondence(
+ frm->y_buffer, (int *)frm_corners, num_frm_corners, ref->y_buffer,
+ (int *)ref_corners, num_ref_corners, frm->y_width, frm->y_height,
+ frm->y_stride, ref->y_stride, correspondences);
+
+ inlier_map = (int *)malloc(num_correspondences * sizeof(*inlier_map));
+ num_inliers = compute_global_motion_params(
+ type, correspondences, num_correspondences, params, inlier_map);
+ free(correspondences);
+ free(inlier_map);
+ return (num_inliers > 0);
+}
diff --git a/av1/encoder/global_motion.h b/av1/encoder/global_motion.h
new file mode 100644
index 0000000..fa4f791
--- /dev/null
+++ b/av1/encoder/global_motion.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AV1_ENCODER_GLOBAL_MOTION_H_
+#define AV1_ENCODER_GLOBAL_MOTION_H_
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ Computes global motion parameters between two frames. The array
+ "params" should be length 9, where the first 2 slots are translation
+ parameters in (row, col) order, and the remaining slots correspond
+ to values in the transformation matrix of the corresponding motion
+ model. They are arranged in "params" such that values on the tx-matrix
+  diagonal have odd-numbered indices so the following matrix:
+ A | B
+ C | D
+ would produce params = [trans row, trans col, B, A, C, D]
+*/
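+// Example (purely illustrative): a translation of 3 rows and -2 cols with an
+// identity tx-matrix (A = D = 1, B = C = 0) would fill the leading entries of
+// params as { 3, -2, 0, 1, 0, 1 }.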
+int compute_global_motion_feature_based(TransformationType type,
+ YV12_BUFFER_CONFIG *frm,
+ YV12_BUFFER_CONFIG *ref,
+ double *params);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AV1_ENCODER_GLOBAL_MOTION_H_
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index 097857d..a88c884 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -16,250 +16,456 @@
#include "av1/common/idct.h"
#include "av1/encoder/hybrid_fwd_txfm.h"
-void av1_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
- int diff_stride, TX_TYPE tx_type, int lossless) {
- if (lossless) {
- av1_fwht4x4(src_diff, coeff, diff_stride);
- } else {
- switch (tx_type) {
- case DCT_DCT: aom_fdct4x4(src_diff, coeff, diff_stride); break;
- case ADST_DCT:
- case DCT_ADST:
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- case DCT_FLIPADST:
- case FLIPADST_FLIPADST:
- case ADST_FLIPADST:
- case FLIPADST_ADST:
- case IDTX:
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
-#endif // CONFIG_EXT_TX
- case ADST_ADST: av1_fht4x4(src_diff, coeff, diff_stride, tx_type); break;
- default: assert(0); break;
- }
- }
+static INLINE void fdct32x32(int rd_transform, const int16_t *src,
+ tran_low_t *dst, int src_stride) {
+ if (rd_transform)
+ aom_fdct32x32_rd(src, dst, src_stride);
+ else
+ av1_fht32x32(src, dst, src_stride, DCT_DCT);
}
-static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
- int diff_stride, TX_TYPE tx_type,
- FWD_TXFM_OPT tx_opt) {
+#if CONFIG_TX64X64
+static INLINE void fdct64x64(const int16_t *src, tran_low_t *dst,
+ int src_stride) {
+ av1_fht64x64(src, dst, src_stride, DCT_DCT);
+}
+
+static INLINE void fdct64x64_1(const int16_t *src, tran_low_t *dst,
+ int src_stride) {
+ int i, j;
+ int32_t sum = 0;
+ memset(dst, 0, sizeof(*dst) * 4096);
+ for (i = 0; i < 64; ++i)
+ for (j = 0; j < 64; ++j) sum += src[i * src_stride + j];
+ // Note: this scaling makes the transform 2 times unitary
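+  // (A unitary 64x64 transform would scale the DC term by 1/64; the shift by
+  // 5 divides by 32, i.e. twice the unitary scaling.)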
+ dst[0] = ROUND_POWER_OF_TWO_SIGNED(sum, 5);
+}
+#endif // CONFIG_TX64X64
+
+static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type, int lossless) {
+ if (lossless) {
+ assert(tx_type == DCT_DCT);
+ av1_fwht4x4(src_diff, coeff, diff_stride);
+ return;
+ }
+
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
case DCT_ADST:
+ case ADST_ADST: av1_fht4x4(src_diff, coeff, diff_stride, tx_type); break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
case DCT_FLIPADST:
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
- case IDTX:
case V_DCT:
case H_DCT:
case V_ADST:
case H_ADST:
case V_FLIPADST:
- case H_FLIPADST:
+ case H_FLIPADST: av1_fht4x4(src_diff, coeff, diff_stride, tx_type); break;
+ case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type); break;
#endif // CONFIG_EXT_TX
+ default: assert(0);
+ }
+}
+
+static void fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ (void)fwd_txfm_opt;
+ av1_fht4x8(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ (void)fwd_txfm_opt;
+ av1_fht8x4(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ (void)fwd_txfm_opt;
+ av1_fht8x16(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ (void)fwd_txfm_opt;
+ av1_fht16x8(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ (void)fwd_txfm_opt;
+ av1_fht16x32(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ (void)fwd_txfm_opt;
+ av1_fht32x16(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
case ADST_ADST:
- if (tx_opt == FWD_TXFM_OPT_NORMAL)
+ if (fwd_txfm_opt == FWD_TXFM_OPT_NORMAL)
av1_fht8x8(src_diff, coeff, diff_stride, tx_type);
- else
+ else // FWD_TXFM_OPT_DC
aom_fdct8x8_1(src_diff, coeff, diff_stride);
break;
- default: assert(0); break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST: av1_fht8x8(src_diff, coeff, diff_stride, tx_type); break;
+ case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
}
}
static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type,
- FWD_TXFM_OPT tx_opt) {
+ FWD_TXFM_OPT fwd_txfm_opt) {
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
case DCT_ADST:
+ case ADST_ADST:
+ if (fwd_txfm_opt == FWD_TXFM_OPT_NORMAL)
+ av1_fht16x16(src_diff, coeff, diff_stride, tx_type);
+ else // FWD_TXFM_OPT_DC
+ aom_fdct16x16_1(src_diff, coeff, diff_stride);
+ break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
case DCT_FLIPADST:
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
- case IDTX:
case V_DCT:
case H_DCT:
case V_ADST:
case H_ADST:
case V_FLIPADST:
- case H_FLIPADST:
+ case H_FLIPADST: av1_fht16x16(src_diff, coeff, diff_stride, tx_type); break;
+ case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type); break;
#endif // CONFIG_EXT_TX
- case ADST_ADST:
- if (tx_opt == FWD_TXFM_OPT_NORMAL)
- av1_fht16x16(src_diff, coeff, diff_stride, tx_type);
- else
- aom_fdct16x16_1(src_diff, coeff, diff_stride);
- break;
- default: assert(0); break;
+ default: assert(0);
}
}
static void fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
tran_low_t *coeff, int diff_stride, TX_TYPE tx_type,
- FWD_TXFM_OPT tx_opt) {
+ FWD_TXFM_OPT fwd_txfm_opt) {
switch (tx_type) {
case DCT_DCT:
- if (tx_opt == FWD_TXFM_OPT_NORMAL)
+ if (fwd_txfm_opt == FWD_TXFM_OPT_NORMAL)
fdct32x32(rd_transform, src_diff, coeff, diff_stride);
- else
+ else // FWD_TXFM_OPT_DC
aom_fdct32x32_1(src_diff, coeff, diff_stride);
break;
+#if CONFIG_EXT_TX
case ADST_DCT:
case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ av1_fht32x32(src_diff, coeff, diff_stride, tx_type);
+ break;
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST: av1_fht32x32(src_diff, coeff, diff_stride, tx_type); break;
+ case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+
+#if CONFIG_TX64X64
+static void fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt) {
+ switch (tx_type) {
+ case DCT_DCT:
+ if (fwd_txfm_opt == FWD_TXFM_OPT_NORMAL)
+ fdct64x64(src_diff, coeff, diff_stride);
+ else // FWD_TXFM_OPT_DC
+ fdct64x64_1(src_diff, coeff, diff_stride);
+ break;
+#if CONFIG_EXT_TX
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ av1_fht64x64(src_diff, coeff, diff_stride, tx_type);
+ break;
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST: av1_fht32x32(src_diff, coeff, diff_stride, tx_type); break;
+ case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+#endif // CONFIG_TX64X64
+
+#if CONFIG_AOM_HIGHBITDEPTH
+static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type, int lossless,
+ const int bd) {
+ if (lossless) {
+ assert(tx_type == DCT_DCT);
+ av1_highbd_fwht4x4(src_diff, coeff, diff_stride);
+ return;
+ }
+
+ switch (tx_type) {
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ av1_fwd_txfm2d_4x4(src_diff, coeff, diff_stride, tx_type, bd);
+ break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
case DCT_FLIPADST:
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
- case IDTX:
+ av1_fwd_txfm2d_4x4(src_diff, coeff, diff_stride, tx_type, bd);
+ break;
case V_DCT:
case H_DCT:
case V_ADST:
case H_ADST:
case V_FLIPADST:
case H_FLIPADST:
+ av1_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
+ break;
+ case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type); break;
#endif // CONFIG_EXT_TX
- case ADST_ADST: assert(0); break;
- default: assert(0); break;
+ default: assert(0);
}
}
-#if CONFIG_AOM_HIGHBITDEPTH
-void av1_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
- int diff_stride, TX_TYPE tx_type, int lossless) {
- if (lossless) {
- assert(tx_type == DCT_DCT);
- av1_highbd_fwht4x4(src_diff, coeff, diff_stride);
- } else {
- switch (tx_type) {
- case DCT_DCT: aom_highbd_fdct4x4(src_diff, coeff, diff_stride); break;
- case ADST_DCT:
- case DCT_ADST:
-#if CONFIG_EXT_TX
- case FLIPADST_DCT:
- case DCT_FLIPADST:
- case FLIPADST_FLIPADST:
- case ADST_FLIPADST:
- case FLIPADST_ADST:
- case IDTX:
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
-#endif // CONFIG_EXT_TX
- case ADST_ADST:
- av1_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
- break;
- default: assert(0); break;
- }
- }
+static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
+ (void)bd;
+ av1_highbd_fht4x8(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
+ (void)bd;
+ av1_highbd_fht8x4(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
+ (void)bd;
+ av1_highbd_fht8x16(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
+ (void)bd;
+ av1_highbd_fht16x8(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
+ (void)bd;
+ av1_highbd_fht16x32(src_diff, coeff, diff_stride, tx_type);
+}
+
+static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
+ (void)bd;
+ av1_highbd_fht32x16(src_diff, coeff, diff_stride, tx_type);
}
static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type,
- FWD_TXFM_OPT tx_opt) {
- // TODO(sarahparker) try using the tx opt for hbd
- (void)tx_opt;
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
switch (tx_type) {
- case DCT_DCT: aom_highbd_fdct8x8(src_diff, coeff, diff_stride); break;
+ case DCT_DCT:
case ADST_DCT:
case DCT_ADST:
+ case ADST_ADST:
+ av1_fwd_txfm2d_8x8(src_diff, coeff, diff_stride, tx_type, bd);
+ break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
case DCT_FLIPADST:
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
- case IDTX:
+ av1_fwd_txfm2d_8x8(src_diff, coeff, diff_stride, tx_type, bd);
+ break;
case V_DCT:
case H_DCT:
case V_ADST:
case H_ADST:
case V_FLIPADST:
case H_FLIPADST:
-#endif // CONFIG_EXT_TX
- case ADST_ADST:
- av1_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
+ // Use C version since DST exists only in C
+ av1_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
break;
- default: assert(0); break;
+ case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
}
}
static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type,
- FWD_TXFM_OPT tx_opt) {
- // TODO(sarahparker) try using the tx opt for hbd
- (void)tx_opt;
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
switch (tx_type) {
- case DCT_DCT: aom_highbd_fdct16x16(src_diff, coeff, diff_stride); break;
+ case DCT_DCT:
case ADST_DCT:
case DCT_ADST:
+ case ADST_ADST:
+ av1_fwd_txfm2d_16x16(src_diff, coeff, diff_stride, tx_type, bd);
+ break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
case DCT_FLIPADST:
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
- case IDTX:
+ av1_fwd_txfm2d_16x16(src_diff, coeff, diff_stride, tx_type, bd);
+ break;
case V_DCT:
case H_DCT:
case V_ADST:
case H_ADST:
case V_FLIPADST:
case H_FLIPADST:
-#endif // CONFIG_EXT_TX
- case ADST_ADST:
- av1_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
+ // Use C version since DST exists only in C
+ av1_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
break;
- default: assert(0); break;
+ case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
}
}
static void highbd_fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
tran_low_t *coeff, int diff_stride,
- TX_TYPE tx_type, FWD_TXFM_OPT tx_opt) {
- // TODO(sarahparker) try using the tx opt for hbd
- (void)tx_opt;
+ TX_TYPE tx_type, FWD_TXFM_OPT fwd_txfm_opt,
+ const int bd) {
+ (void)rd_transform;
+ (void)fwd_txfm_opt;
switch (tx_type) {
case DCT_DCT:
- highbd_fdct32x32(rd_transform, src_diff, coeff, diff_stride);
+ av1_fwd_txfm2d_32x32(src_diff, coeff, diff_stride, tx_type, bd);
break;
+#if CONFIG_EXT_TX
case ADST_DCT:
case DCT_ADST:
-#if CONFIG_EXT_TX
+ case ADST_ADST:
case FLIPADST_DCT:
case DCT_FLIPADST:
case FLIPADST_FLIPADST:
case ADST_FLIPADST:
case FLIPADST_ADST:
- case IDTX:
case V_DCT:
case H_DCT:
case V_ADST:
case H_ADST:
case V_FLIPADST:
case H_FLIPADST:
+ av1_highbd_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
+ break;
+ case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type); break;
#endif // CONFIG_EXT_TX
- case ADST_ADST: assert(0); break;
default: assert(0); break;
}
}
+
+#if CONFIG_TX64X64
+static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TX_TYPE tx_type,
+ FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+ (void)fwd_txfm_opt;
+ (void)bd;
+ switch (tx_type) {
+ case DCT_DCT:
+ av1_highbd_fht64x64_c(src_diff, coeff, diff_stride, tx_type);
+ break;
+#if CONFIG_EXT_TX
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ av1_highbd_fht64x64_c(src_diff, coeff, diff_stride, tx_type);
+ break;
+ case IDTX: av1_fwd_idtx_c(src_diff, coeff, diff_stride, 64, tx_type); break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+}
+#endif // CONFIG_TX64X64
#endif // CONFIG_AOM_HIGHBITDEPTH
void fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
@@ -270,6 +476,11 @@
const int rd_transform = fwd_txfm_param->rd_transform;
const int lossless = fwd_txfm_param->lossless;
switch (tx_size) {
+#if CONFIG_TX64X64
+ case TX_64X64:
+ fwd_txfm_64x64(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ break;
+#endif // CONFIG_TX64X64
case TX_32X32:
fwd_txfm_32x32(rd_transform, src_diff, coeff, diff_stride, tx_type,
fwd_txfm_opt);
@@ -280,8 +491,26 @@
case TX_8X8:
fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
break;
+ case TX_4X8:
+ fwd_txfm_4x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ break;
+ case TX_8X4:
+ fwd_txfm_8x4(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ break;
+ case TX_8X16:
+ fwd_txfm_8x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ break;
+ case TX_16X8:
+ fwd_txfm_16x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ break;
+ case TX_16X32:
+ fwd_txfm_16x32(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ break;
+ case TX_32X16:
+ fwd_txfm_32x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ break;
case TX_4X4:
- av1_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless);
+ fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless);
break;
default: assert(0); break;
}
@@ -295,20 +524,52 @@
const TX_SIZE tx_size = fwd_txfm_param->tx_size;
const int rd_transform = fwd_txfm_param->rd_transform;
const int lossless = fwd_txfm_param->lossless;
+ const int bd = fwd_txfm_param->bd;
switch (tx_size) {
+#if CONFIG_TX64X64
+ case TX_64X64:
+ highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
+ break;
+#endif // CONFIG_TX64X64
case TX_32X32:
highbd_fwd_txfm_32x32(rd_transform, src_diff, coeff, diff_stride, tx_type,
- fwd_txfm_opt);
+ fwd_txfm_opt, bd);
break;
case TX_16X16:
- highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type,
- fwd_txfm_opt);
+ highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
break;
case TX_8X8:
- highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+ highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
+ break;
+ case TX_4X8:
+ highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
+ break;
+ case TX_8X4:
+ highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
+ break;
+ case TX_8X16:
+ highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
+ break;
+ case TX_16X8:
+ highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
+ break;
+ case TX_16X32:
+ highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
+ break;
+ case TX_32X16:
+ highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt,
+ bd);
break;
case TX_4X4:
- av1_highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless);
+ highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless, bd);
break;
default: assert(0); break;
}
diff --git a/av1/encoder/hybrid_fwd_txfm.h b/av1/encoder/hybrid_fwd_txfm.h
index 9cd8c11..2a5959e 100644
--- a/av1/encoder/hybrid_fwd_txfm.h
+++ b/av1/encoder/hybrid_fwd_txfm.h
@@ -22,46 +22,25 @@
FWD_TXFM_OPT fwd_txfm_opt;
int rd_transform;
int lossless;
+#if CONFIG_AOM_HIGHBITDEPTH
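+  // Bit depth, used by the high bit-depth forward transforms.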
+ int bd;
+#endif // CONFIG_AOM_HIGHBITDEPTH
} FWD_TXFM_PARAM;
#ifdef __cplusplus
extern "C" {
#endif
-void av1_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
- int diff_stride, TX_TYPE tx_type, int lossless);
-
void fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
FWD_TXFM_PARAM *fwd_txfm_param);
#if CONFIG_AOM_HIGHBITDEPTH
-void av1_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
- int diff_stride, TX_TYPE tx_type, int lossless);
-
void highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, FWD_TXFM_PARAM *fwd_txfm_param);
#endif // CONFIG_AOM_HIGHBITDEPTH
-static INLINE void fdct32x32(int rd_transform, const int16_t *src,
- tran_low_t *dst, int src_stride) {
- if (rd_transform)
- aom_fdct32x32_rd(src, dst, src_stride);
- else
- aom_fdct32x32(src, dst, src_stride);
-}
-
-#if CONFIG_AOM_HIGHBITDEPTH
-static INLINE void highbd_fdct32x32(int rd_transform, const int16_t *src,
- tran_low_t *dst, int src_stride) {
- if (rd_transform)
- aom_highbd_fdct32x32_rd(src, dst, src_stride);
- else
- aom_highbd_fdct32x32(src, dst, src_stride);
-}
-#endif // CONFIG_AOM_HIGHBITDEPTH
-
#ifdef __cplusplus
} // extern "C"
#endif
-#endif // AV1_ENCODER_ENCODEMB_H_
+#endif // AV1_ENCODER_HYBRID_FWD_TXFM_H_
diff --git a/av1/encoder/lookahead.c b/av1/encoder/lookahead.c
index 21f56eb..6b4500b 100644
--- a/av1/encoder/lookahead.c
+++ b/av1/encoder/lookahead.c
@@ -20,9 +20,8 @@
#include "av1/encoder/lookahead.h"
/* Return the buffer at the given absolute index and increment the index */
-static struct lookahead_entry *pop(struct lookahead_ctx *ctx,
- unsigned int *idx) {
- unsigned int index = *idx;
+static struct lookahead_entry *pop(struct lookahead_ctx *ctx, int *idx) {
+ int index = *idx;
struct lookahead_entry *buf = ctx->buf + index;
assert(index < ctx->max_sz);
@@ -34,7 +33,7 @@
void av1_lookahead_destroy(struct lookahead_ctx *ctx) {
if (ctx) {
if (ctx->buf) {
- unsigned int i;
+ int i;
for (i = 0; i < ctx->max_sz; i++) aom_free_frame_buffer(&ctx->buf[i].img);
free(ctx->buf);
@@ -206,9 +205,9 @@
if (index >= 0) {
// Forward peek
- if (index < (int)ctx->sz) {
+ if (index < ctx->sz) {
index += ctx->read_idx;
- if (index >= (int)ctx->max_sz) index -= ctx->max_sz;
+ if (index >= ctx->max_sz) index -= ctx->max_sz;
buf = ctx->buf + index;
}
} else if (index < 0) {
diff --git a/av1/encoder/lookahead.h b/av1/encoder/lookahead.h
index 4f2803c..8ffc58b 100644
--- a/av1/encoder/lookahead.h
+++ b/av1/encoder/lookahead.h
@@ -32,10 +32,10 @@
#define MAX_PRE_FRAMES 1
struct lookahead_ctx {
- unsigned int max_sz; /* Absolute size of the queue */
- unsigned int sz; /* Number of buffers currently in the queue */
- unsigned int read_idx; /* Read index */
- unsigned int write_idx; /* Write index */
+ int max_sz; /* Absolute size of the queue */
+ int sz; /* Number of buffers currently in the queue */
+ int read_idx; /* Read index */
+ int write_idx; /* Write index */
struct lookahead_entry *buf; /* Buffer list */
};
diff --git a/av1/encoder/mbgraph.c b/av1/encoder/mbgraph.c
index 550917d..1fd1682 100644
--- a/av1/encoder/mbgraph.c
+++ b/av1/encoder/mbgraph.c
@@ -24,8 +24,7 @@
#include "av1/common/reconintra.h"
static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv,
- MV *dst_mv, int mb_row,
- int mb_col) {
+ int mb_row, int mb_col) {
MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
@@ -49,22 +48,31 @@
/*cpi->sf.search_method == HEX*/
av1_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
- cond_cost_list(cpi, cost_list), &v_fn_ptr, 0, ref_mv, dst_mv);
+ cond_cost_list(cpi, cost_list), &v_fn_ptr, 0, ref_mv);
// Try sub-pixel MC
// if (bestsme > error_thresh && bestsme < INT_MAX)
{
int distortion;
unsigned int sse;
- cpi->find_fractional_mv_step(
- x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
- &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
- cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0,
- 0, 0);
+ cpi->find_fractional_mv_step(x, ref_mv, cpi->common.allow_high_precision_mv,
+ x->errorperbit, &v_fn_ptr, 0,
+ mv_sf->subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), NULL, NULL,
+ &distortion, &sse, NULL, 0, 0, 0);
}
- xd->mi[0]->mbmi.mode = NEWMV;
- xd->mi[0]->mbmi.mv[0].as_mv = *dst_mv;
+#if CONFIG_EXT_INTER
+ if (has_second_ref(&xd->mi[0]->mbmi))
+ xd->mi[0]->mbmi.mode = NEW_NEWMV;
+ else
+#endif // CONFIG_EXT_INTER
+ xd->mi[0]->mbmi.mode = NEWMV;
+
+ xd->mi[0]->mbmi.mv[0] = x->best_mv;
+#if CONFIG_EXT_INTER
+ xd->mi[0]->mbmi.ref_frame[1] = NONE;
+#endif // CONFIG_EXT_INTER
av1_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16);
@@ -78,39 +86,40 @@
xd->plane[0].dst.buf, xd->plane[0].dst.stride);
}
-static int do_16x16_motion_search(AV1_COMP *cpi, const MV *ref_mv,
- int_mv *dst_mv, int mb_row, int mb_col) {
+static int do_16x16_motion_search(AV1_COMP *cpi, const MV *ref_mv, int mb_row,
+ int mb_col) {
MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
unsigned int err, tmp_err;
- MV tmp_mv;
+ MV best_mv;
// Try zero MV first
// FIXME should really use something like near/nearest MV and/or MV prediction
err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
- dst_mv->as_int = 0;
+ best_mv.col = best_mv.row = 0;
// Test last reference frame using the previous best mv as the
// starting point (best reference) for the search
- tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv, mb_row, mb_col);
+ tmp_err = do_16x16_motion_iteration(cpi, ref_mv, mb_row, mb_col);
if (tmp_err < err) {
err = tmp_err;
- dst_mv->as_mv = tmp_mv;
+ best_mv = x->best_mv.as_mv;
}
// If the current best reference mv is not centered on 0,0 then do a 0,0
// based search as well.
if (ref_mv->row != 0 || ref_mv->col != 0) {
MV zero_ref_mv = { 0, 0 };
- tmp_err =
- do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv, mb_row, mb_col);
+
+ tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, mb_row, mb_col);
if (tmp_err < err) {
- dst_mv->as_mv = tmp_mv;
err = tmp_err;
+ best_mv = x->best_mv.as_mv;
}
}
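+  // Pass the winning MV back to the caller via x->best_mv.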
+ x->best_mv.as_mv = best_mv;
return err;
}
@@ -140,7 +149,7 @@
unsigned int err;
xd->mi[0]->mbmi.mode = mode;
- av1_predict_intra_block(xd, 2, 2, TX_16X16, mode, x->plane[0].src.buf,
+ av1_predict_intra_block(xd, 16, 16, TX_16X16, mode, x->plane[0].src.buf,
x->plane[0].src.stride, xd->plane[0].dst.buf,
xd->plane[0].dst.stride, 0, 0, 0);
err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
@@ -187,8 +196,8 @@
xd->plane[0].pre[0].buf = golden_ref->y_buffer + mb_y_offset;
xd->plane[0].pre[0].stride = golden_ref->y_stride;
g_motion_error =
- do_16x16_motion_search(cpi, prev_golden_ref_mv,
- &stats->ref[GOLDEN_FRAME].m.mv, mb_row, mb_col);
+ do_16x16_motion_search(cpi, prev_golden_ref_mv, mb_row, mb_col);
+ stats->ref[GOLDEN_FRAME].m.mv = x->best_mv;
stats->ref[GOLDEN_FRAME].err = g_motion_error;
} else {
stats->ref[GOLDEN_FRAME].err = INT_MAX;
diff --git a/av1/encoder/mbgraph.h b/av1/encoder/mbgraph.h
index db005e1..758e2ad 100644
--- a/av1/encoder/mbgraph.h
+++ b/av1/encoder/mbgraph.h
@@ -23,7 +23,7 @@
int_mv mv;
PREDICTION_MODE mode;
} m;
- } ref[MAX_REF_FRAMES];
+ } ref[TOTAL_REFS_PER_FRAME];
} MBGRAPH_MB_STATS;
typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS;
diff --git a/av1/encoder/mcomp.c b/av1/encoder/mcomp.c
index cd5a3c4..4b54a2c 100644
--- a/av1/encoder/mcomp.c
+++ b/av1/encoder/mcomp.c
@@ -25,6 +25,7 @@
#include "av1/encoder/encoder.h"
#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rdopt.h"
// #define NEW_DIAMOND_SEARCH
@@ -92,17 +93,10 @@
static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
int sad_per_bit) {
-#if CONFIG_REF_MV
- const MV diff = { (mv->row - ref->row) << 3, (mv->col - ref->col) << 3 };
+ const MV diff = { (mv->row - ref->row) * 8, (mv->col - ref->col) * 8 };
return ROUND_POWER_OF_TWO(
(unsigned)mv_cost(&diff, x->nmvjointsadcost, x->mvsadcost) * sad_per_bit,
AV1_PROB_COST_SHIFT);
-#else
- const MV diff = { mv->row - ref->row, mv->col - ref->col };
- return ROUND_POWER_OF_TWO(
- (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
- AV1_PROB_COST_SHIFT);
-#endif
}
void av1_init_dsmotion_compensation(search_site_config *cfg, int stride) {
@@ -159,17 +153,6 @@
* could reduce the area.
*/
-/* Estimated (square) error cost of a motion vector (r,c). The 14 scale comes
- * from the same math as in mv_err_cost(). */
-#define MVC(r, c) \
- (mvcost \
- ? ((unsigned)(mvjcost[((r) != rr) * 2 + ((c) != rc)] + \
- mvcost[0][((r)-rr)] + mvcost[1][((c)-rc)]) * \
- error_per_bit + \
- 8192) >> \
- 14 \
- : 0)
-
// convert motion vector component to offset for sv[a]f calc
static INLINE int sp(int x) { return x & 7; }
@@ -178,23 +161,26 @@
}
/* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER(v, r, c) \
- if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
- if (second_pred == NULL) \
- thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
- src_stride, &sse); \
- else \
- thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
- src_stride, &sse, second_pred); \
- if ((v = MVC(r, c) + thismse) < besterr) { \
- besterr = v; \
- br = r; \
- bc = c; \
- *distortion = thismse; \
- *sse1 = sse; \
- } \
- } else { \
- v = INT_MAX; \
+#define CHECK_BETTER(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ MV this_mv = { r, c }; \
+ v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \
+ if (second_pred == NULL) \
+ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+ src_address, src_stride, &sse); \
+ else \
+ thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+ src_address, src_stride, &sse, second_pred); \
+ v += thismse; \
+ if (v < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
}
#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
@@ -205,20 +191,23 @@
}
/* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER1(v, r, c) \
- if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
- thismse = \
- upsampled_pref_error(xd, vfp, z, src_stride, upre(y, y_stride, r, c), \
- y_stride, second_pred, w, h, &sse); \
- if ((v = MVC(r, c) + thismse) < besterr) { \
- besterr = v; \
- br = r; \
- bc = c; \
- *distortion = thismse; \
- *sse1 = sse; \
- } \
- } else { \
- v = INT_MAX; \
+#define CHECK_BETTER1(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ MV this_mv = { r, c }; \
+ thismse = upsampled_pref_error(xd, vfp, src_address, src_stride, \
+ upre(y, y_stride, r, c), y_stride, \
+ second_pred, w, h, &sse); \
+ v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \
+ v += thismse; \
+ if (v < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
}
#define FIRST_LEVEL_CHECKS \
@@ -291,13 +280,14 @@
}
#define SETUP_SUBPEL_SEARCH \
- const uint8_t *const z = x->plane[0].src.buf; \
+ const uint8_t *const src_address = x->plane[0].src.buf; \
const int src_stride = x->plane[0].src.stride; \
const MACROBLOCKD *xd = &x->e_mbd; \
unsigned int besterr = INT_MAX; \
unsigned int sse; \
unsigned int whichdir; \
int thismse; \
+ MV *bestmv = &x->best_mv.as_mv; \
const unsigned int halfiters = iters_per_step; \
const unsigned int quarteriters = iters_per_step; \
const unsigned int eighthiters = iters_per_step; \
@@ -305,8 +295,6 @@
const int offset = bestmv->row * y_stride + bestmv->col; \
const uint8_t *const y = xd->plane[0].pre[0].buf; \
\
- int rr = ref_mv->row; \
- int rc = ref_mv->col; \
int br = bestmv->row * 8; \
int bc = bestmv->col * 8; \
int hstep = 4; \
@@ -330,13 +318,13 @@
#if CONFIG_AOM_HIGHBITDEPTH
if (second_pred != NULL) {
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
+ DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
aom_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
y_stride);
besterr =
vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1);
} else {
- DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
}
@@ -348,7 +336,7 @@
#else
(void)xd;
if (second_pred != NULL) {
- DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
} else {
@@ -385,15 +373,15 @@
}
int av1_find_best_sub_pixel_tree_pruned_evenmore(
- const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
- int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop,
- int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
- int *distortion, unsigned int *sse1, const uint8_t *second_pred, int w,
- int h, int use_upsampled_ref) {
+ MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, const uint8_t *second_pred, int w, int h,
+ int use_upsampled_ref) {
SETUP_SUBPEL_SEARCH;
- besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z,
- src_stride, y, y_stride, second_pred, w, h,
- offset, mvjcost, mvcost, sse1, distortion);
+ besterr = setup_center_error(
+ xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y,
+ y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion);
(void)halfiters;
(void)quarteriters;
(void)eighthiters;
@@ -455,17 +443,17 @@
}
int av1_find_best_sub_pixel_tree_pruned_more(
- const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
- int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop,
- int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
- int *distortion, unsigned int *sse1, const uint8_t *second_pred, int w,
- int h, int use_upsampled_ref) {
+ MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, const uint8_t *second_pred, int w, int h,
+ int use_upsampled_ref) {
SETUP_SUBPEL_SEARCH;
(void)use_upsampled_ref;
- besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z,
- src_stride, y, y_stride, second_pred, w, h,
- offset, mvjcost, mvcost, sse1, distortion);
+ besterr = setup_center_error(
+ xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y,
+ y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion);
if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
@@ -521,17 +509,17 @@
}
int av1_find_best_sub_pixel_tree_pruned(
- const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
- int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop,
- int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
- int *distortion, unsigned int *sse1, const uint8_t *second_pred, int w,
- int h, int use_upsampled_ref) {
+ MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, const uint8_t *second_pred, int w, int h,
+ int use_upsampled_ref) {
SETUP_SUBPEL_SEARCH;
(void)use_upsampled_ref;
- besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z,
- src_stride, y, y_stride, second_pred, w, h,
- offset, mvjcost, mvcost, sse1, distortion);
+ besterr = setup_center_error(
+ xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y,
+ y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion);
if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
cost_list[4] != INT_MAX) {
@@ -608,11 +596,14 @@
return besterr;
}
+/* clang-format off */
static const MV search_step_table[12] = {
// left, right, up, down
- { 0, -4 }, { 0, 4 }, { -4, 0 }, { 4, 0 }, { 0, -2 }, { 0, 2 },
- { -2, 0 }, { 2, 0 }, { 0, -1 }, { 0, 1 }, { -1, 0 }, { 1, 0 }
+ { 0, -4 }, { 0, 4 }, { -4, 0 }, { 4, 0 },
+ { 0, -2 }, { 0, 2 }, { -2, 0 }, { 2, 0 },
+ { 0, -1 }, { 0, 1 }, { -1, 0 }, { 1, 0 }
};
+/* clang-format on */
static int upsampled_pref_error(const MACROBLOCKD *xd,
const aom_variance_fn_ptr_t *vfp,
@@ -623,7 +614,7 @@
unsigned int besterr;
#if CONFIG_AOM_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- DECLARE_ALIGNED(16, uint16_t, pred16[64 * 64]);
+ DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
if (second_pred != NULL)
aom_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, y,
y_stride);
@@ -632,9 +623,9 @@
besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, sse);
} else {
- DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
#else
- DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
(void)xd;
#endif // CONFIG_AOM_HIGHBITDEPTH
if (second_pred != NULL)
@@ -662,25 +653,25 @@
return besterr;
}
-int av1_find_best_sub_pixel_tree(
- const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
- int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop,
- int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
- int *distortion, unsigned int *sse1, const uint8_t *second_pred, int w,
- int h, int use_upsampled_ref) {
- const uint8_t *const z = x->plane[0].src.buf;
- const uint8_t *const src_address = z;
+int av1_find_best_sub_pixel_tree(MACROBLOCK *x, const MV *ref_mv, int allow_hp,
+ int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp,
+ int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1,
+ const uint8_t *second_pred, int w, int h,
+ int use_upsampled_ref) {
+ const uint8_t *const src_address = x->plane[0].src.buf;
const int src_stride = x->plane[0].src.stride;
const MACROBLOCKD *xd = &x->e_mbd;
unsigned int besterr = INT_MAX;
unsigned int sse;
unsigned int thismse;
const int y_stride = xd->plane[0].pre[0].stride;
+ MV *bestmv = &x->best_mv.as_mv;
const int offset = bestmv->row * y_stride + bestmv->col;
const uint8_t *const y = xd->plane[0].pre[0].buf;
- int rr = ref_mv->row;
- int rc = ref_mv->col;
int br = bestmv->row * 8;
int bc = bestmv->col * 8;
int hstep = 4;
@@ -705,12 +696,13 @@
// use_upsampled_ref can be 0 or 1
if (use_upsampled_ref)
besterr = upsampled_setup_center_error(
- xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride,
- second_pred, w, h, offset * 8, mvjcost, mvcost, sse1, distortion);
+ xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y,
+ y_stride, second_pred, w, h, (offset * 8), mvjcost, mvcost, sse1,
+ distortion);
else
- besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z,
- src_stride, y, y_stride, second_pred, w, h,
- offset, mvjcost, mvcost, sse1, distortion);
+ besterr = setup_center_error(
+ xd, bestmv, ref_mv, error_per_bit, vfp, src_address, src_stride, y,
+ y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion);
(void)cost_list; // to silence compiler warning
@@ -808,17 +800,11 @@
}
}
- tr = br;
- tc = bc;
-
search_step += 4;
hstep >>= 1;
best_idx = -1;
}
- // Each subsequent iteration checks at least one point in common with
- // the last iteration could be 2 ( if diag selected) 1/4 pel
-
  // These lines ensure static analysis doesn't warn that
// tr and tc aren't used after the above point.
(void)tr;
@@ -834,7 +820,6 @@
return besterr;
}
-#undef MVC
#undef PRE
#undef CHECK_BETTER
@@ -866,16 +851,16 @@
#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates
// Calculate and return a sad+mvcost list around an integer best pel.
-static INLINE void calc_int_cost_list(const MACROBLOCK *x, const MV *ref_mv,
- int sadpb,
+static INLINE void calc_int_cost_list(const MACROBLOCK *x,
+ const MV *const ref_mv, int sadpb,
const aom_variance_fn_ptr_t *fn_ptr,
const MV *best_mv, int *cost_list) {
static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } };
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0];
const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
- int br = best_mv->row;
- int bc = best_mv->col;
+ const int br = best_mv->row;
+ const int bc = best_mv->col;
int i;
unsigned int sse;
const MV this_mv = { br, bc };
@@ -909,18 +894,63 @@
}
}
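+// Fills cost_list with the one-away integer-pel SAD values around best_mv.
+// Entries already computed during the search are reused; when use_mvcost is
+// set, the MV cost is added to those existing entries.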
+static INLINE void calc_int_sad_list(const MACROBLOCK *x,
+ const MV *const ref_mv, int sadpb,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *best_mv, int *cost_list,
+ const int use_mvcost, const int bestsad) {
+ static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } };
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0];
+ const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
+ int i;
+ const int br = best_mv->row;
+ const int bc = best_mv->col;
+
+ if (cost_list[0] == INT_MAX) {
+ cost_list[0] = bestsad;
+ if (check_bounds(x, br, bc, 1)) {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ cost_list[i + 1] =
+ fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ }
+ } else {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ if (!is_mv_in(x, &this_mv))
+ cost_list[i + 1] = INT_MAX;
+ else
+ cost_list[i + 1] =
+ fn_ptr->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ }
+ }
+ } else {
+ if (use_mvcost) {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
+ if (cost_list[i + 1] != INT_MAX) {
+ cost_list[i + 1] += mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+ }
+ }
+ }
+ }
+}
+
// Generic pattern search function that searches over multiple scales.
// Each scale can have a different number of candidates and shape of
// candidates as indicated in the num_candidates and candidates arrays
// passed into this function
//
-static int av1_pattern_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
- int sad_per_bit, int do_init_search,
- int *cost_list, const aom_variance_fn_ptr_t *vfp,
- int use_mvcost, const MV *center_mv, MV *best_mv,
- const int num_candidates[MAX_PATTERN_SCALES],
- const MV candidates[MAX_PATTERN_SCALES]
- [MAX_PATTERN_CANDIDATES]) {
+static int pattern_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv,
+ const int num_candidates[MAX_PATTERN_SCALES],
+ const MV candidates[MAX_PATTERN_SCALES]
+ [MAX_PATTERN_CANDIDATES]) {
const MACROBLOCKD *const xd = &x->e_mbd;
static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
@@ -928,6 +958,7 @@
int i, s, t;
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ const int last_is_4 = num_candidates[0] == 4;
int br, bc;
int bestsad = INT_MAX;
int thissad;
@@ -935,188 +966,19 @@
const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
int best_init_s = search_param_to_steps[search_param];
  // adjust start_mv to make sure it is within MV range
- clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
- br = ref_mv->row;
- bc = ref_mv->col;
-
- // Work out the start point for the search
- bestsad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
- in_what->stride) +
- mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
-
- // Search all possible scales upto the search param around the center point
- // pick the scale of the point that is best as the starting scale of
- // further steps around it.
- if (do_init_search) {
- s = best_init_s;
- best_init_s = -1;
- for (t = 0; t <= s; ++t) {
- int best_site = -1;
- if (check_bounds(x, br, bc, 1 << t)) {
- for (i = 0; i < num_candidates[t]; i++) {
- const MV this_mv = { br + candidates[t][i].row,
- bc + candidates[t][i].col };
- thissad =
- vfp->sdf(what->buf, what->stride,
- get_buf_from_mv(in_what, &this_mv), in_what->stride);
- CHECK_BETTER
- }
- } else {
- for (i = 0; i < num_candidates[t]; i++) {
- const MV this_mv = { br + candidates[t][i].row,
- bc + candidates[t][i].col };
- if (!is_mv_in(x, &this_mv)) continue;
- thissad =
- vfp->sdf(what->buf, what->stride,
- get_buf_from_mv(in_what, &this_mv), in_what->stride);
- CHECK_BETTER
- }
- }
- if (best_site == -1) {
- continue;
- } else {
- best_init_s = t;
- k = best_site;
- }
- }
- if (best_init_s != -1) {
- br += candidates[best_init_s][k].row;
- bc += candidates[best_init_s][k].col;
- }
- }
-
- // If the center point is still the best, just skip this and move to
- // the refinement step.
- if (best_init_s != -1) {
- int best_site = -1;
- s = best_init_s;
-
- do {
- // No need to search all 6 points the 1st time if initial search was used
- if (!do_init_search || s != best_init_s) {
- if (check_bounds(x, br, bc, 1 << s)) {
- for (i = 0; i < num_candidates[s]; i++) {
- const MV this_mv = { br + candidates[s][i].row,
- bc + candidates[s][i].col };
- thissad =
- vfp->sdf(what->buf, what->stride,
- get_buf_from_mv(in_what, &this_mv), in_what->stride);
- CHECK_BETTER
- }
- } else {
- for (i = 0; i < num_candidates[s]; i++) {
- const MV this_mv = { br + candidates[s][i].row,
- bc + candidates[s][i].col };
- if (!is_mv_in(x, &this_mv)) continue;
- thissad =
- vfp->sdf(what->buf, what->stride,
- get_buf_from_mv(in_what, &this_mv), in_what->stride);
- CHECK_BETTER
- }
- }
-
- if (best_site == -1) {
- continue;
- } else {
- br += candidates[s][best_site].row;
- bc += candidates[s][best_site].col;
- k = best_site;
- }
- }
-
- do {
- int next_chkpts_indices[PATTERN_CANDIDATES_REF];
- best_site = -1;
- next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
- next_chkpts_indices[1] = k;
- next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
-
- if (check_bounds(x, br, bc, 1 << s)) {
- for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
- const MV this_mv = {
- br + candidates[s][next_chkpts_indices[i]].row,
- bc + candidates[s][next_chkpts_indices[i]].col
- };
- thissad =
- vfp->sdf(what->buf, what->stride,
- get_buf_from_mv(in_what, &this_mv), in_what->stride);
- CHECK_BETTER
- }
- } else {
- for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
- const MV this_mv = {
- br + candidates[s][next_chkpts_indices[i]].row,
- bc + candidates[s][next_chkpts_indices[i]].col
- };
- if (!is_mv_in(x, &this_mv)) continue;
- thissad =
- vfp->sdf(what->buf, what->stride,
- get_buf_from_mv(in_what, &this_mv), in_what->stride);
- CHECK_BETTER
- }
- }
-
- if (best_site != -1) {
- k = next_chkpts_indices[best_site];
- br += candidates[s][k].row;
- bc += candidates[s][k].col;
- }
- } while (best_site != -1);
- } while (s--);
- }
-
- // Returns the one-away integer pel sad values around the best as follows:
- // cost_list[0]: cost at the best integer pel
- // cost_list[1]: cost at delta {0, -1} (left) from the best integer pel
- // cost_list[2]: cost at delta { 1, 0} (bottom) from the best integer pel
- // cost_list[3]: cost at delta { 0, 1} (right) from the best integer pel
- // cost_list[4]: cost at delta {-1, 0} (top) from the best integer pel
- if (cost_list) {
- const MV best_int_mv = { br, bc };
- calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, &best_int_mv,
- cost_list);
- }
- best_mv->row = br;
- best_mv->col = bc;
- return bestsad;
-}
-
-// A specialized function where the smallest scale search candidates
-// are 4 1-away neighbors, and cost_list is non-null
-// TODO(debargha): Merge this function with the one above. Also remove
-// use_mvcost option since it is always 1, to save unnecessary branches.
-static int av1_pattern_search_sad(
- const MACROBLOCK *x, MV *ref_mv, int search_param, int sad_per_bit,
- int do_init_search, int *cost_list, const aom_variance_fn_ptr_t *vfp,
- int use_mvcost, const MV *center_mv, MV *best_mv,
- const int num_candidates[MAX_PATTERN_SCALES],
- const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES]) {
- const MACROBLOCKD *const xd = &x->e_mbd;
- static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
- 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
- };
- int i, s, t;
- const struct buf_2d *const what = &x->plane[0].src;
- const struct buf_2d *const in_what = &xd->plane[0].pre[0];
- int br, bc;
- int bestsad = INT_MAX;
- int thissad;
- int k = -1;
- const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
- int best_init_s = search_param_to_steps[search_param];
- // adjust ref_mv to make sure it is within MV range
- clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
- br = ref_mv->row;
- bc = ref_mv->col;
+ clamp_mv(start_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min,
+ x->mv_row_max);
+ br = start_mv->row;
+ bc = start_mv->col;
if (cost_list != NULL) {
cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] =
INT_MAX;
}
// Work out the start point for the search
- bestsad = vfp->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
- in_what->stride) +
- mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+ bestsad = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, start_mv), in_what->stride) +
+ mvsad_err_cost(x, start_mv, &fcenter_mv, sad_per_bit);
   // Search all possible scales up to the search param around the center point
// pick the scale of the point that is best as the starting scale of
@@ -1162,11 +1024,12 @@
// If the center point is still the best, just skip this and move to
// the refinement step.
if (best_init_s != -1) {
- int do_sad = (num_candidates[0] == 4 && cost_list != NULL);
+ const int last_s = (last_is_4 && cost_list != NULL);
int best_site = -1;
s = best_init_s;
- for (; s >= do_sad; s--) {
+ for (; s >= last_s; s--) {
+ // No need to search all points the 1st time if initial search was used
if (!do_init_search || s != best_init_s) {
if (check_bounds(x, br, bc, 1 << s)) {
for (i = 0; i < num_candidates[s]; i++) {
@@ -1316,48 +1179,24 @@
}
}
- // Returns the one-away integer pel sad values around the best as follows:
- // cost_list[0]: sad at the best integer pel
- // cost_list[1]: sad at delta {0, -1} (left) from the best integer pel
- // cost_list[2]: sad at delta { 1, 0} (bottom) from the best integer pel
- // cost_list[3]: sad at delta { 0, 1} (right) from the best integer pel
- // cost_list[4]: sad at delta {-1, 0} (top) from the best integer pel
+ // Returns the one-away integer pel cost/sad around the best as follows:
+ // cost_list[0]: cost/sad at the best integer pel
+ // cost_list[1]: cost/sad at delta {0, -1} (left) from the best integer pel
+ // cost_list[2]: cost/sad at delta { 1, 0} (bottom) from the best integer pel
+ // cost_list[3]: cost/sad at delta { 0, 1} (right) from the best integer pel
+ // cost_list[4]: cost/sad at delta {-1, 0} (top) from the best integer pel
if (cost_list) {
- static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } };
- if (cost_list[0] == INT_MAX) {
- cost_list[0] = bestsad;
- if (check_bounds(x, br, bc, 1)) {
- for (i = 0; i < 4; i++) {
- const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
- cost_list[i + 1] =
- vfp->sdf(what->buf, what->stride,
- get_buf_from_mv(in_what, &this_mv), in_what->stride);
- }
- } else {
- for (i = 0; i < 4; i++) {
- const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
- if (!is_mv_in(x, &this_mv))
- cost_list[i + 1] = INT_MAX;
- else
- cost_list[i + 1] =
- vfp->sdf(what->buf, what->stride,
- get_buf_from_mv(in_what, &this_mv), in_what->stride);
- }
- }
+ const MV best_int_mv = { br, bc };
+ if (last_is_4) {
+ calc_int_sad_list(x, center_mv, sad_per_bit, vfp, &best_int_mv, cost_list,
+ use_mvcost, bestsad);
} else {
- if (use_mvcost) {
- for (i = 0; i < 4; i++) {
- const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
- if (cost_list[i + 1] != INT_MAX) {
- cost_list[i + 1] +=
- mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
- }
- }
- }
+ calc_int_cost_list(x, center_mv, sad_per_bit, vfp, &best_int_mv,
+ cost_list);
}
}
- best_mv->row = br;
- best_mv->col = bc;
+ x->best_mv.as_mv.row = br;
+ x->best_mv.as_mv.col = bc;
return bestsad;
}
@@ -1393,298 +1232,148 @@
: 0);
}
-int av1_hex_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
+int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param,
int sad_per_bit, int do_init_search, int *cost_list,
const aom_variance_fn_ptr_t *vfp, int use_mvcost,
- const MV *center_mv, MV *best_mv) {
+ const MV *center_mv) {
// First scale has 8-closest points, the rest have 6 points in hex shape
// at increasing scales
static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6,
6, 6, 6, 6, 6 };
// Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
static const MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
- { { -1, -1 },
- { 0, -1 },
- { 1, -1 },
- { 1, 0 },
- { 1, 1 },
- { 0, 1 },
- { -1, 1 },
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, { -1, 1 },
{ -1, 0 } },
{ { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } },
{ { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } },
{ { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } },
{ { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 }, { -8, 16 }, { -16, 0 } },
- { { -16, -32 },
- { 16, -32 },
- { 32, 0 },
- { 16, 32 },
- { -16, 32 },
+ { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 },
{ -32, 0 } },
- { { -32, -64 },
- { 32, -64 },
- { 64, 0 },
- { 32, 64 },
- { -32, 64 },
+ { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 },
{ -64, 0 } },
- { { -64, -128 },
- { 64, -128 },
- { 128, 0 },
- { 64, 128 },
- { -64, 128 },
+ { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 }, { -64, 128 },
{ -128, 0 } },
- { { -128, -256 },
- { 128, -256 },
- { 256, 0 },
- { 128, 256 },
- { -128, 256 },
+ { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 }, { -128, 256 },
{ -256, 0 } },
- { { -256, -512 },
- { 256, -512 },
- { 512, 0 },
- { 256, 512 },
- { -256, 512 },
+ { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 }, { -256, 512 },
{ -512, 0 } },
- { { -512, -1024 },
- { 512, -1024 },
- { 1024, 0 },
- { 512, 1024 },
- { -512, 1024 },
- { -1024, 0 } },
+ { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 },
+ { -512, 1024 }, { -1024, 0 } },
};
- return av1_pattern_search(
- x, ref_mv, search_param, sad_per_bit, do_init_search, cost_list, vfp,
- use_mvcost, center_mv, best_mv, hex_num_candidates, hex_candidates);
+ /* clang-format on */
+ return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
+ cost_list, vfp, use_mvcost, center_mv,
+ hex_num_candidates, hex_candidates);
}
-int av1_bigdia_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
- int sad_per_bit, int do_init_search, int *cost_list,
- const aom_variance_fn_ptr_t *vfp, int use_mvcost,
- const MV *center_mv, MV *best_mv) {
+static int bigdia_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
// First scale has 4-closest points, the rest have 8 points in diamond
// shape at increasing scales
static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
};
// Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
static const MV
bigdia_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
{ { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } },
- { { -1, -1 },
- { 0, -2 },
- { 1, -1 },
- { 2, 0 },
- { 1, 1 },
- { 0, 2 },
- { -1, 1 },
- { -2, 0 } },
- { { -2, -2 },
- { 0, -4 },
- { 2, -2 },
- { 4, 0 },
- { 2, 2 },
- { 0, 4 },
- { -2, 2 },
- { -4, 0 } },
- { { -4, -4 },
- { 0, -8 },
- { 4, -4 },
- { 8, 0 },
- { 4, 4 },
- { 0, 8 },
- { -4, 4 },
- { -8, 0 } },
- { { -8, -8 },
- { 0, -16 },
- { 8, -8 },
- { 16, 0 },
- { 8, 8 },
- { 0, 16 },
- { -8, 8 },
- { -16, 0 } },
- { { -16, -16 },
- { 0, -32 },
- { 16, -16 },
- { 32, 0 },
- { 16, 16 },
- { 0, 32 },
- { -16, 16 },
- { -32, 0 } },
- { { -32, -32 },
- { 0, -64 },
- { 32, -32 },
- { 64, 0 },
- { 32, 32 },
- { 0, 64 },
- { -32, 32 },
- { -64, 0 } },
- { { -64, -64 },
- { 0, -128 },
- { 64, -64 },
- { 128, 0 },
- { 64, 64 },
- { 0, 128 },
- { -64, 64 },
- { -128, 0 } },
- { { -128, -128 },
- { 0, -256 },
- { 128, -128 },
- { 256, 0 },
- { 128, 128 },
- { 0, 256 },
- { -128, 128 },
- { -256, 0 } },
- { { -256, -256 },
- { 0, -512 },
- { 256, -256 },
- { 512, 0 },
- { 256, 256 },
- { 0, 512 },
- { -256, 256 },
- { -512, 0 } },
- { { -512, -512 },
- { 0, -1024 },
- { 512, -512 },
- { 1024, 0 },
- { 512, 512 },
- { 0, 1024 },
- { -512, 512 },
- { -1024, 0 } },
+ { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 },
+ { -1, 1 }, { -2, 0 } },
+ { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 },
+ { -2, 2 }, { -4, 0 } },
+ { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 },
+ { -4, 4 }, { -8, 0 } },
+ { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 },
+ { -8, 8 }, { -16, 0 } },
+ { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 },
+ { 0, 32 }, { -16, 16 }, { -32, 0 } },
+ { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 },
+ { 0, 64 }, { -32, 32 }, { -64, 0 } },
+ { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 },
+ { 0, 128 }, { -64, 64 }, { -128, 0 } },
+ { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 }, { 128, 128 },
+ { 0, 256 }, { -128, 128 }, { -256, 0 } },
+ { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 }, { 256, 256 },
+ { 0, 512 }, { -256, 256 }, { -512, 0 } },
+ { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 },
+ { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } },
};
- return av1_pattern_search_sad(
- x, ref_mv, search_param, sad_per_bit, do_init_search, cost_list, vfp,
- use_mvcost, center_mv, best_mv, bigdia_num_candidates, bigdia_candidates);
+ /* clang-format on */
+ return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
+ cost_list, vfp, use_mvcost, center_mv,
+ bigdia_num_candidates, bigdia_candidates);
}
-int av1_square_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
- int sad_per_bit, int do_init_search, int *cost_list,
- const aom_variance_fn_ptr_t *vfp, int use_mvcost,
- const MV *center_mv, MV *best_mv) {
+static int square_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
// All scales have 8 closest points in square shape
static const int square_num_candidates[MAX_PATTERN_SCALES] = {
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
};
// Note that the largest candidate step at each scale is 2^scale
- static const MV square_candidates[MAX_PATTERN_SCALES]
- [MAX_PATTERN_CANDIDATES] = {
- { { -1, -1 },
- { 0, -1 },
- { 1, -1 },
- { 1, 0 },
- { 1, 1 },
- { 0, 1 },
- { -1, 1 },
- { -1, 0 } },
- { { -2, -2 },
- { 0, -2 },
- { 2, -2 },
- { 2, 0 },
- { 2, 2 },
- { 0, 2 },
- { -2, 2 },
- { -2, 0 } },
- { { -4, -4 },
- { 0, -4 },
- { 4, -4 },
- { 4, 0 },
- { 4, 4 },
- { 0, 4 },
- { -4, 4 },
- { -4, 0 } },
- { { -8, -8 },
- { 0, -8 },
- { 8, -8 },
- { 8, 0 },
- { 8, 8 },
- { 0, 8 },
- { -8, 8 },
- { -8, 0 } },
- { { -16, -16 },
- { 0, -16 },
- { 16, -16 },
- { 16, 0 },
- { 16, 16 },
- { 0, 16 },
- { -16, 16 },
- { -16, 0 } },
- { { -32, -32 },
- { 0, -32 },
- { 32, -32 },
- { 32, 0 },
- { 32, 32 },
- { 0, 32 },
- { -32, 32 },
- { -32, 0 } },
- { { -64, -64 },
- { 0, -64 },
- { 64, -64 },
- { 64, 0 },
- { 64, 64 },
- { 0, 64 },
- { -64, 64 },
- { -64, 0 } },
- { { -128, -128 },
- { 0, -128 },
- { 128, -128 },
- { 128, 0 },
- { 128, 128 },
- { 0, 128 },
- { -128, 128 },
- { -128, 0 } },
- { { -256, -256 },
- { 0, -256 },
- { 256, -256 },
- { 256, 0 },
- { 256, 256 },
- { 0, 256 },
- { -256, 256 },
- { -256, 0 } },
- { { -512, -512 },
- { 0, -512 },
- { 512, -512 },
- { 512, 0 },
- { 512, 512 },
- { 0, 512 },
- { -512, 512 },
- { -512, 0 } },
- { { -1024, -1024 },
- { 0, -1024 },
- { 1024, -1024 },
- { 1024, 0 },
- { 1024, 1024 },
- { 0, 1024 },
- { -1024, 1024 },
- { -1024, 0 } },
- };
- return av1_pattern_search(
- x, ref_mv, search_param, sad_per_bit, do_init_search, cost_list, vfp,
- use_mvcost, center_mv, best_mv, square_num_candidates, square_candidates);
+ /* clang-format off */
+ static const MV
+ square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
+ { -1, 1 }, { -1, 0 } },
+ { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 },
+ { -2, 2 }, { -2, 0 } },
+ { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 },
+ { -4, 4 }, { -4, 0 } },
+ { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 },
+ { -8, 8 }, { -8, 0 } },
+ { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 },
+ { 0, 16 }, { -16, 16 }, { -16, 0 } },
+ { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 },
+ { 0, 32 }, { -32, 32 }, { -32, 0 } },
+ { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 },
+ { 0, 64 }, { -64, 64 }, { -64, 0 } },
+ { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 }, { 128, 128 },
+ { 0, 128 }, { -128, 128 }, { -128, 0 } },
+ { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 }, { 256, 256 },
+ { 0, 256 }, { -256, 256 }, { -256, 0 } },
+ { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 }, { 512, 512 },
+ { 0, 512 }, { -512, 512 }, { -512, 0 } },
+ { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 },
+ { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } },
+ };
+ /* clang-format on */
+ return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
+ cost_list, vfp, use_mvcost, center_mv,
+ square_num_candidates, square_candidates);
}
-int av1_fast_hex_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
- int sad_per_bit,
- int do_init_search, // must be zero for fast_hex
- int *cost_list, const aom_variance_fn_ptr_t *vfp,
- int use_mvcost, const MV *center_mv, MV *best_mv) {
+static int fast_hex_search(MACROBLOCK *x, MV *ref_mv, int search_param,
+ int sad_per_bit,
+ int do_init_search, // must be zero for fast_hex
+ int *cost_list, const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost, const MV *center_mv) {
return av1_hex_search(x, ref_mv, AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param),
sad_per_bit, do_init_search, cost_list, vfp, use_mvcost,
- center_mv, best_mv);
+ center_mv);
}
-int av1_fast_dia_search(const MACROBLOCK *x, MV *ref_mv, int search_param,
- int sad_per_bit, int do_init_search, int *cost_list,
- const aom_variance_fn_ptr_t *vfp, int use_mvcost,
- const MV *center_mv, MV *best_mv) {
- return av1_bigdia_search(
- x, ref_mv, AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param), sad_per_bit,
- do_init_search, cost_list, vfp, use_mvcost, center_mv, best_mv);
+static int fast_dia_search(MACROBLOCK *x, MV *ref_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
+ return bigdia_search(x, ref_mv, AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param),
+ sad_per_bit, do_init_search, cost_list, vfp, use_mvcost,
+ center_mv);
}
#undef CHECK_BETTER
 // Exhaustive motion search around a given centre position with a given
// step size.
-static int exhuastive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv,
+static int exhuastive_mesh_search(MACROBLOCK *x, MV *ref_mv, MV *best_mv,
int range, int step, int sad_per_bit,
const aom_variance_fn_ptr_t *fn_ptr,
const MV *center_mv) {
@@ -1723,6 +1412,7 @@
sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
if (sad < best_sad) {
best_sad = sad;
+ x->second_best_mv.as_mv = *best_mv;
*best_mv = mv;
}
}
@@ -1744,6 +1434,7 @@
sads[i] + mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
if (sad < best_sad) {
best_sad = sad;
+ x->second_best_mv.as_mv = *best_mv;
*best_mv = mv;
}
}
@@ -1758,6 +1449,7 @@
sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
if (sad < best_sad) {
best_sad = sad;
+ x->second_best_mv.as_mv = *best_mv;
*best_mv = mv;
}
}
@@ -1770,7 +1462,7 @@
return best_sad;
}
-int av1_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg,
+int av1_diamond_search_sad_c(MACROBLOCK *x, const search_site_config *cfg,
MV *ref_mv, MV *best_mv, int search_param,
int sad_per_bit, int *num00,
const aom_variance_fn_ptr_t *fn_ptr,
@@ -1878,6 +1570,7 @@
}
}
if (best_site != last_site) {
+ x->second_best_mv.as_mv = *best_mv;
best_mv->row += ss[best_site].mv.row;
best_mv->col += ss[best_site].mv.col;
best_address += ss[best_site].offset;
@@ -1985,11 +1678,11 @@
int mi_col) {
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
- struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } };
- DECLARE_ALIGNED(16, int16_t, hbuf[128]);
- DECLARE_ALIGNED(16, int16_t, vbuf[128]);
- DECLARE_ALIGNED(16, int16_t, src_hbuf[64]);
- DECLARE_ALIGNED(16, int16_t, src_vbuf[64]);
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+ DECLARE_ALIGNED(16, int16_t, hbuf[2 * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, int16_t, vbuf[2 * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, int16_t, src_hbuf[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, int16_t, src_vbuf[MAX_SB_SQUARE]);
int idx;
const int bw = 4 << b_width_log2_lookup[bsize];
const int bh = 4 << b_height_log2_lookup[bsize];
@@ -2112,18 +1805,18 @@
/* do_refine: If last step (1-away) of n-step search doesn't pick the center
point as the best match, we will do a final 1-away diamond
refining search */
-int av1_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
- int step_param, int sadpb, int further_steps,
- int do_refine, int *cost_list,
- const aom_variance_fn_ptr_t *fn_ptr,
- const MV *ref_mv, MV *dst_mv) {
+static int full_pixel_diamond(const AV1_COMP *const cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine, int *cost_list,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv) {
MV temp_mv;
int thissme, n, num00 = 0;
int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
step_param, sadpb, &n, fn_ptr, ref_mv);
if (bestsme < INT_MAX)
bestsme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
- *dst_mv = temp_mv;
+ x->best_mv.as_mv = temp_mv;
// If there won't be more n-step search, check to see if refining search is
// needed.
@@ -2146,7 +1839,7 @@
if (thissme < bestsme) {
bestsme = thissme;
- *dst_mv = temp_mv;
+ x->best_mv.as_mv = temp_mv;
}
}
}
@@ -2154,20 +1847,20 @@
// final 1-away diamond refining search
if (do_refine) {
const int search_range = 8;
- MV best_mv = *dst_mv;
+ MV best_mv = x->best_mv.as_mv;
thissme = av1_refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr,
ref_mv);
if (thissme < INT_MAX)
thissme = av1_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
if (thissme < bestsme) {
bestsme = thissme;
- *dst_mv = best_mv;
+ x->best_mv.as_mv = best_mv;
}
}
// Return cost list.
if (cost_list) {
- calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+ calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, &x->best_mv.as_mv, cost_list);
}
return bestsme;
}
@@ -2178,7 +1871,8 @@
 // Runs a limited range exhaustive mesh search using a pattern set
// according to the encode speed profile.
static int full_pixel_exhaustive(const AV1_COMP *const cpi, MACROBLOCK *x,
- MV *centre_mv_full, int sadpb, int *cost_list,
+ const MV *centre_mv_full, int sadpb,
+ int *cost_list,
const aom_variance_fn_ptr_t *fn_ptr,
const MV *ref_mv, MV *dst_mv) {
const SPEED_FEATURES *const sf = &cpi->sf;
@@ -2425,7 +2119,7 @@
return best_sad;
}
-int av1_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
+int av1_refining_search_sad(MACROBLOCK *x, MV *ref_mv, int error_per_bit,
int search_range,
const aom_variance_fn_ptr_t *fn_ptr,
const MV *center_mv) {
@@ -2489,6 +2183,7 @@
if (best_site == -1) {
break;
} else {
+ x->second_best_mv.as_mv = *ref_mv;
ref_mv->row += neighbors[best_site].row;
ref_mv->col += neighbors[best_site].col;
best_address = get_buf_from_mv(in_what, ref_mv);
@@ -2500,8 +2195,7 @@
// This function is called when we do joint motion search in comp_inter_inter
// mode.
-int av1_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
- int search_range,
+int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
const aom_variance_fn_ptr_t *fn_ptr,
const MV *center_mv, const uint8_t *second_pred) {
const MV neighbors[8] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 },
@@ -2510,18 +2204,19 @@
const struct buf_2d *const what = &x->plane[0].src;
const struct buf_2d *const in_what = &xd->plane[0].pre[0];
const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ MV *best_mv = &x->best_mv.as_mv;
unsigned int best_sad =
- fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
+ fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
in_what->stride, second_pred) +
- mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+ mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
int i, j;
for (i = 0; i < search_range; ++i) {
int best_site = -1;
for (j = 0; j < 8; ++j) {
- const MV mv = { ref_mv->row + neighbors[j].row,
- ref_mv->col + neighbors[j].col };
+ const MV mv = { best_mv->row + neighbors[j].row,
+ best_mv->col + neighbors[j].col };
if (is_mv_in(x, &mv)) {
unsigned int sad =
@@ -2540,6 +2235,543 @@
if (best_site == -1) {
break;
} else {
+ best_mv->row += neighbors[best_site].row;
+ best_mv->col += neighbors[best_site].col;
+ }
+ }
+ return best_sad;
+}
+
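+// Decide whether a follow-on exhaustive (mesh) search may be run: cap the
+// number of exhaustive searches relative to the searches done so far in this
+// thread/frame, and skip it for source alt-ref frames.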
+#define MIN_EX_SEARCH_LIMIT 128
+static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const int max_ex =
+ AOMMAX(MIN_EX_SEARCH_LIMIT,
+ (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
+
+ return sf->allow_exhaustive_searches &&
+ (sf->exhaustive_searches_thresh < INT_MAX) &&
+ (*x->ex_search_count_ptr <= max_ex) && !cpi->rc.is_src_frame_alt_ref;
+}
+
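+// Top-level full-pel motion search: dispatches on sf->mv.search_method and
+// leaves the winning motion vector in x->best_mv.as_mv.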
+int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ MV *mvp_full, int step_param, int error_per_bit,
+ int *cost_list, const MV *ref_mv, int var_max,
+ int rd) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const SEARCH_METHODS method = sf->mv.search_method;
+ const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
+ int var = 0;
+
+ if (cost_list) {
+ cost_list[0] = INT_MAX;
+ cost_list[1] = INT_MAX;
+ cost_list[2] = INT_MAX;
+ cost_list[3] = INT_MAX;
+ cost_list[4] = INT_MAX;
+ }
+
+ // Keep track of number of searches (this frame in this thread).
+ ++(*x->m_search_count_ptr);
+
+ switch (method) {
+ case FAST_DIAMOND:
+ var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
+ cost_list, fn_ptr, 1, ref_mv);
+ break;
+ case FAST_HEX:
+ var = fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
+ cost_list, fn_ptr, 1, ref_mv);
+ break;
+ case HEX:
+ var = av1_hex_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+ fn_ptr, 1, ref_mv);
+ break;
+ case SQUARE:
+ var = square_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+ fn_ptr, 1, ref_mv);
+ break;
+ case BIGDIA:
+ var = bigdia_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+ fn_ptr, 1, ref_mv);
+ break;
+ case NSTEP:
+ var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
+ MAX_MVSEARCH_STEPS - 1 - step_param, 1,
+ cost_list, fn_ptr, ref_mv);
+
+ // Should we allow a follow on exhaustive search?
+ if (is_exhaustive_allowed(cpi, x)) {
+      int64_t exhaustive_thr = sf->exhaustive_searches_thresh;
+      exhaustive_thr >>=
+          8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
+
+      // Threshold variance for an exhaustive full search.
+      if (var > exhaustive_thr) {
+ int var_ex;
+ MV tmp_mv_ex;
+ var_ex =
+ full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv, error_per_bit,
+ cost_list, fn_ptr, ref_mv, &tmp_mv_ex);
+
+ if (var_ex < var) {
+ var = var_ex;
+ x->best_mv.as_mv = tmp_mv_ex;
+ }
+ }
+ }
+ break;
+
+ default: assert(0 && "Invalid search method.");
+ }
+
+ if (method != NSTEP && rd && var < var_max)
+ var = av1_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1);
+
+ return var;
+}
+
+#if CONFIG_EXT_INTER
+/* returns subpixel variance error function */
+#define DIST(r, c) \
+ vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, src_stride, \
+ mask, mask_stride, &sse)
+
+/* checks if (r, c) has better score than previous best */
+
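+// MVC(r, c): approximate rate cost of coding the MV (r, c) against (rr, rc),
+// i.e. the MV joint-type cost plus the per-component costs, scaled by
+// error_per_bit and rounded (+4096, >> 13) into the distortion domain.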
+#define MVC(r, c) \
+ (mvcost \
+ ? ((mvjcost[((r) != rr) * 2 + ((c) != rc)] + mvcost[0][((r)-rr)] + \
+ mvcost[1][((c)-rc)]) * \
+ error_per_bit + \
+ 4096) >> \
+ 13 \
+ : 0)
+
+#define CHECK_BETTER(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ thismse = (DIST(r, c)); \
+ if ((v = MVC(r, c) + thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+#undef CHECK_BETTER0
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
+
+#undef CHECK_BETTER1
+#define CHECK_BETTER1(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ thismse = upsampled_masked_pref_error(xd, mask, mask_stride, vfp, z, \
+ src_stride, upre(y, y_stride, r, c), \
+ y_stride, w, h, &sse); \
+ if ((v = MVC(r, c) + thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
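+// Sub-pel refinement of *bestmv using the masked variance function: evaluates
+// 1/2-, 1/4- and (when allow_hp and not forced to stop early) 1/8-pel
+// candidates around the current best, returning INT_MAX if the result drifts
+// more than MAX_FULL_PEL_VAL from ref_mv.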
+int av1_find_best_masked_sub_pixel_tree(
+ const MACROBLOCK *x, const uint8_t *mask, int mask_stride, MV *bestmv,
+ const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
+ int is_second) {
+ const uint8_t *const z = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int besterr = INT_MAX;
+ unsigned int sse;
+ int thismse;
+ unsigned int whichdir;
+ unsigned int halfiters = iters_per_step;
+ unsigned int quarteriters = iters_per_step;
+ unsigned int eighthiters = iters_per_step;
+
+ const int y_stride = xd->plane[0].pre[is_second].stride;
+ const int offset = bestmv->row * y_stride + bestmv->col;
+ const uint8_t *const y = xd->plane[0].pre[is_second].buf;
+
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
+ int hstep = 4;
+ const int minc = AOMMAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+ const int maxc = AOMMIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+ const int minr = AOMMAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+ const int maxr = AOMMIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
+
+ int tr = br;
+ int tc = bc;
+
+ // central mv
+ bestmv->row *= 8;
+ bestmv->col *= 8;
+
+ // calculate central point error
+ besterr =
+ vfp->mvf(y + offset, y_stride, z, src_stride, mask, mask_stride, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+ // 1/2 pel
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+
+ if (allow_hp && forced_stop == 0) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+  // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+ return INT_MAX;
+
+ return besterr;
+}
+
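+// Masked variance at the (full-pel) centre position plus its MV rate cost;
+// the raw distortion is returned through *distortion.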
+static unsigned int setup_masked_center_error(
+ const uint8_t *mask, int mask_stride, const MV *bestmv, const MV *ref_mv,
+ int error_per_bit, const aom_variance_fn_ptr_t *vfp,
+ const uint8_t *const src, const int src_stride, const uint8_t *const y,
+ int y_stride, int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1,
+ int *distortion) {
+ unsigned int besterr;
+ besterr =
+ vfp->mvf(y + offset, y_stride, src, src_stride, mask, mask_stride, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ return besterr;
+}
+
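+// Builds an upsampled prediction of the w x h block (high-bitdepth aware) and
+// returns its masked variance against the source.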
+static int upsampled_masked_pref_error(const MACROBLOCKD *xd,
+ const uint8_t *mask, int mask_stride,
+ const aom_variance_fn_ptr_t *vfp,
+ const uint8_t *const src,
+ const int src_stride,
+ const uint8_t *const y, int y_stride,
+ int w, int h, unsigned int *sse) {
+ unsigned int besterr;
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+ aom_highbd_upsampled_pred(pred16, w, h, y, y_stride);
+
+ besterr = vfp->mvf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride, mask,
+ mask_stride, sse);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+#else
+ DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+ (void)xd;
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ aom_upsampled_pred(pred, w, h, y, y_stride);
+
+ besterr = vfp->mvf(pred, w, src, src_stride, mask, mask_stride, sse);
+#if CONFIG_AOM_HIGHBITDEPTH
+ }
+#endif
+ return besterr;
+}
+
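+// As setup_masked_center_error(), but measured against the upsampled
+// reference.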
+static unsigned int upsampled_setup_masked_center_error(
+ const MACROBLOCKD *xd, const uint8_t *mask, int mask_stride,
+ const MV *bestmv, const MV *ref_mv, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, const uint8_t *const src,
+ const int src_stride, const uint8_t *const y, int y_stride, int w, int h,
+ int offset, int *mvjcost, int *mvcost[2], unsigned int *sse1,
+ int *distortion) {
+ unsigned int besterr =
+ upsampled_masked_pref_error(xd, mask, mask_stride, vfp, src, src_stride,
+ y + offset, y_stride, w, h, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ return besterr;
+}
+
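+// Sub-pel refinement using the masked variance, optionally against the
+// upsampled reference plane (use_upsampled_ref): each round checks the four
+// axis-aligned candidates plus one diagonal, then halves the step size.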
+int av1_find_best_masked_sub_pixel_tree_up(
+ const AV1_COMP *cpi, MACROBLOCK *x, const uint8_t *mask, int mask_stride,
+ int mi_row, int mi_col, MV *bestmv, const MV *ref_mv, int allow_hp,
+ int error_per_bit, const aom_variance_fn_ptr_t *vfp, int forced_stop,
+ int iters_per_step, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, int is_second, int use_upsampled_ref) {
+ const uint8_t *const z = x->plane[0].src.buf;
+ const uint8_t *const src_address = z;
+ const int src_stride = x->plane[0].src.stride;
+ MACROBLOCKD *xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ unsigned int besterr = INT_MAX;
+ unsigned int sse;
+ unsigned int thismse;
+
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
+ int hstep = 4;
+ int iter;
+ int round = 3 - forced_stop;
+ const int minc = AOMMAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+ const int maxc = AOMMIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+ const int minr = AOMMAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+ const int maxr = AOMMIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
+ int tr = br;
+ int tc = bc;
+ const MV *search_step = search_step_table;
+ int idx, best_idx = -1;
+ unsigned int cost_array[5];
+ int kr, kc;
+ const int w = 4 * num_4x4_blocks_wide_lookup[mbmi->sb_type];
+ const int h = 4 * num_4x4_blocks_high_lookup[mbmi->sb_type];
+ int offset;
+ int y_stride;
+ const uint8_t *y;
+
+ const struct buf_2d backup_pred = pd->pre[is_second];
+ if (use_upsampled_ref) {
+ int ref = xd->mi[0]->mbmi.ref_frame[is_second];
+ const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
+ setup_pred_plane(&pd->pre[is_second], upsampled_ref->y_buffer,
+ upsampled_ref->y_crop_width, upsampled_ref->y_crop_height,
+ upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
+ NULL, pd->subsampling_x, pd->subsampling_y);
+ }
+ y = pd->pre[is_second].buf;
+ y_stride = pd->pre[is_second].stride;
+ offset = bestmv->row * y_stride + bestmv->col;
+
+  if (!allow_hp && round == 3) round = 2;
+
+ bestmv->row *= 8;
+ bestmv->col *= 8;
+
+ // use_upsampled_ref can be 0 or 1
+ if (use_upsampled_ref)
+ besterr = upsampled_setup_masked_center_error(
+ xd, mask, mask_stride, bestmv, ref_mv, error_per_bit, vfp, z,
+ src_stride, y, y_stride, w, h, (offset * 8), mvjcost, mvcost, sse1,
+ distortion);
+ else
+ besterr = setup_masked_center_error(
+ mask, mask_stride, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y,
+ y_stride, offset, mvjcost, mvcost, sse1, distortion);
+
+ for (iter = 0; iter < round; ++iter) {
+ // Check vertical and horizontal sub-pixel positions.
+ for (idx = 0; idx < 4; ++idx) {
+ tr = br + search_step[idx].row;
+ tc = bc + search_step[idx].col;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv = { tr, tc };
+
+ if (use_upsampled_ref) {
+ const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+ thismse = upsampled_masked_pref_error(
+ xd, mask, mask_stride, vfp, src_address, src_stride, pre_address,
+ y_stride, w, h, &sse);
+ } else {
+ const uint8_t *const pre_address =
+ y + (tr >> 3) * y_stride + (tc >> 3);
+ thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr),
+ src_address, src_stride, mask, mask_stride, &sse);
+ }
+
+ cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
+ mvcost, error_per_bit);
+
+ if (cost_array[idx] < besterr) {
+ best_idx = idx;
+ besterr = cost_array[idx];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+ cost_array[idx] = INT_MAX;
+ }
+ }
+
+ // Check diagonal sub-pixel position
+ kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+ kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+ tc = bc + kc;
+ tr = br + kr;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv = { tr, tc };
+
+ if (use_upsampled_ref) {
+ const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+ thismse = upsampled_masked_pref_error(
+ xd, mask, mask_stride, vfp, src_address, src_stride, pre_address,
+ y_stride, w, h, &sse);
+ } else {
+ const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+
+ thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr), src_address,
+ src_stride, mask, mask_stride, &sse);
+ }
+
+ cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
+ error_per_bit);
+
+ if (cost_array[4] < besterr) {
+ best_idx = 4;
+ besterr = cost_array[4];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+      cost_array[4] = INT_MAX;
+ }
+
+ if (best_idx < 4 && best_idx >= 0) {
+ br += search_step[best_idx].row;
+ bc += search_step[best_idx].col;
+ } else if (best_idx == 4) {
+ br = tr;
+ bc = tc;
+ }
+
+ if (iters_per_step > 1 && best_idx != -1) {
+ if (use_upsampled_ref) {
+ SECOND_LEVEL_CHECKS_BEST(1);
+ } else {
+ SECOND_LEVEL_CHECKS_BEST(0);
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+ search_step += 4;
+ hstep >>= 1;
+ best_idx = -1;
+ }
+
+  // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ if (use_upsampled_ref) {
+ pd->pre[is_second] = backup_pred;
+ }
+
+ if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+ return INT_MAX;
+
+ return besterr;
+}
+
+#undef DIST
+#undef MVC
+#undef CHECK_BETTER
+
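+// Masked variance of the prediction at best_mv (full pel), plus the MV rate
+// cost relative to center_mv when use_mvcost is set.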
+static int get_masked_mvpred_var(const MACROBLOCK *x, const uint8_t *mask,
+ int mask_stride, const MV *best_mv,
+ const MV *center_mv,
+ const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost, int is_second) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+ const MV mv = { best_mv->row * 8, best_mv->col * 8 };
+ unsigned int unused;
+
+ return vfp->mvf(what->buf, what->stride, get_buf_from_mv(in_what, best_mv),
+ in_what->stride, mask, mask_stride, &unused) +
+ (use_mvcost ? mv_err_cost(&mv, center_mv, x->nmvjointcost, x->mvcost,
+ x->errorperbit)
+ : 0);
+}
+
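+// Greedy +/-1 pel refinement of *ref_mv using the masked SAD; stops once no
+// neighbour improves on the current best.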
+int masked_refining_search_sad(const MACROBLOCK *x, const uint8_t *mask,
+ int mask_stride, MV *ref_mv, int error_per_bit,
+ int search_range,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, int is_second) {
+ const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ unsigned int best_sad =
+ fn_ptr->msdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv),
+ in_what->stride, mask, mask_stride) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+ int i, j;
+
+ for (i = 0; i < search_range; i++) {
+ int best_site = -1;
+
+ for (j = 0; j < 4; j++) {
+ const MV mv = { ref_mv->row + neighbors[j].row,
+ ref_mv->col + neighbors[j].col };
+ if (is_mv_in(x, &mv)) {
+ unsigned int sad =
+ fn_ptr->msdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
+ in_what->stride, mask, mask_stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
ref_mv->row += neighbors[best_site].row;
ref_mv->col += neighbors[best_site].col;
}
@@ -2547,93 +2779,156 @@
return best_sad;
}
-#define MIN_EX_SEARCH_LIMIT 128
-static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) {
- const SPEED_FEATURES *const sf = &cpi->sf;
- const int max_ex =
- AOMMAX(MIN_EX_SEARCH_LIMIT,
- (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
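+// Masked variant of the diamond SAD search (cf. av1_diamond_search_sad_c):
+// steps through the configured search sites using the masked SAD functions.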
+int masked_diamond_search_sad(const MACROBLOCK *x,
+ const search_site_config *cfg,
+ const uint8_t *mask, int mask_stride, MV *ref_mv,
+ MV *best_mv, int search_param, int sad_per_bit,
+ int *num00, const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, int is_second) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+  // search_param determines the length of the initial step and hence the
+  // number of iterations:
+  //   0 = initial step (MAX_FIRST_STEP) pel, 1 = (MAX_FIRST_STEP/2) pel,
+  //   2 = (MAX_FIRST_STEP/4) pel, etc.
+ const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
+ const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ const uint8_t *best_address, *in_what_ref;
+ int best_sad = INT_MAX;
+ int best_site = 0;
+ int last_site = 0;
+ int i, j, step;
- return sf->allow_exhaustive_searches &&
- (sf->exhaustive_searches_thresh < INT_MAX) &&
- (*x->ex_search_count_ptr <= max_ex) && !cpi->rc.is_src_frame_alt_ref;
-}
+ clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+ in_what_ref = get_buf_from_mv(in_what, ref_mv);
+ best_address = in_what_ref;
+ *num00 = 0;
+ *best_mv = *ref_mv;
-int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
- MV *mvp_full, int step_param, int error_per_bit,
- int *cost_list, const MV *ref_mv, MV *tmp_mv,
- int var_max, int rd) {
- const SPEED_FEATURES *const sf = &cpi->sf;
- const SEARCH_METHODS method = sf->mv.search_method;
- const aom_variance_fn_ptr_t *const fn_ptr = &cpi->fn_ptr[bsize];
- int var = 0;
- if (cost_list) {
- cost_list[0] = INT_MAX;
- cost_list[1] = INT_MAX;
- cost_list[2] = INT_MAX;
- cost_list[3] = INT_MAX;
- cost_list[4] = INT_MAX;
- }
+ // Check the starting position
+ best_sad = fn_ptr->msdf(what->buf, what->stride, best_address,
+ in_what->stride, mask, mask_stride) +
+ mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
- // Keep track of number of searches (this frame in this thread).
- ++(*x->m_search_count_ptr);
+ i = 1;
- switch (method) {
- case FAST_DIAMOND:
- var = av1_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
- cost_list, fn_ptr, 1, ref_mv, tmp_mv);
- break;
- case FAST_HEX:
- var = av1_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
- cost_list, fn_ptr, 1, ref_mv, tmp_mv);
- break;
- case HEX:
- var = av1_hex_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
- fn_ptr, 1, ref_mv, tmp_mv);
- break;
- case SQUARE:
- var = av1_square_search(x, mvp_full, step_param, error_per_bit, 1,
- cost_list, fn_ptr, 1, ref_mv, tmp_mv);
- break;
- case BIGDIA:
- var = av1_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
- cost_list, fn_ptr, 1, ref_mv, tmp_mv);
- break;
- case NSTEP:
- var = av1_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
- MAX_MVSEARCH_STEPS - 1 - step_param, 1,
- cost_list, fn_ptr, ref_mv, tmp_mv);
-
- // Should we allow a follow on exhaustive search?
- if (is_exhaustive_allowed(cpi, x)) {
- int64_t exhuastive_thr = sf->exhaustive_searches_thresh;
- exhuastive_thr >>=
- 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
-
- // Threshold variance for an exhaustive full search.
- if (var > exhuastive_thr) {
- int var_ex;
- MV tmp_mv_ex;
- var_ex = full_pixel_exhaustive(cpi, x, tmp_mv, error_per_bit,
- cost_list, fn_ptr, ref_mv, &tmp_mv_ex);
-
- if (var_ex < var) {
- var = var_ex;
- *tmp_mv = tmp_mv_ex;
+ for (step = 0; step < tot_steps; step++) {
+ for (j = 0; j < cfg->searches_per_step; j++) {
+ const MV mv = { best_mv->row + ss[i].mv.row,
+ best_mv->col + ss[i].mv.col };
+ if (is_mv_in(x, &mv)) {
+ int sad =
+ fn_ptr->msdf(what->buf, what->stride, best_address + ss[i].offset,
+ in_what->stride, mask, mask_stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = i;
}
}
}
- break;
- break;
- default: assert(0 && "Invalid search method.");
+ i++;
+ }
+
+ if (best_site != last_site) {
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)
+ while (1) {
+ const MV this_mv = { best_mv->row + ss[best_site].mv.row,
+ best_mv->col + ss[best_site].mv.col };
+ if (is_mv_in(x, &this_mv)) {
+ int sad = fn_ptr->msdf(what->buf, what->stride,
+ best_address + ss[best_site].offset,
+ in_what->stride, mask, mask_stride);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ continue;
+ }
+ }
+ }
+ break;
+ }
+#endif
+ } else if (best_address == in_what_ref) {
+ (*num00)++;
+ }
+ }
+ return best_sad;
+}
+
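+// Masked counterpart of full_pixel_diamond(): runs a sequence of diamond
+// searches with increasing step_param, followed by an optional final 1-away
+// refining search, and writes the result to *dst_mv.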
+int av1_masked_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
+ const uint8_t *mask, int mask_stride,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv, int is_second) {
+ MV temp_mv;
+ int thissme, n, num00 = 0;
+ int bestsme = masked_diamond_search_sad(x, &cpi->ss_cfg, mask, mask_stride,
+ mvp_full, &temp_mv, step_param, sadpb,
+ &n, fn_ptr, ref_mv, is_second);
+ if (bestsme < INT_MAX)
+ bestsme = get_masked_mvpred_var(x, mask, mask_stride, &temp_mv, ref_mv,
+ fn_ptr, 1, is_second);
+ *dst_mv = temp_mv;
+
+ // If there won't be more n-step search, check to see if refining search is
+ // needed.
+ if (n > further_steps) do_refine = 0;
+
+ while (n < further_steps) {
+ ++n;
+
+ if (num00) {
+ num00--;
+ } else {
+ thissme = masked_diamond_search_sad(
+ x, &cpi->ss_cfg, mask, mask_stride, mvp_full, &temp_mv,
+ step_param + n, sadpb, &num00, fn_ptr, ref_mv, is_second);
+ if (thissme < INT_MAX)
+ thissme = get_masked_mvpred_var(x, mask, mask_stride, &temp_mv, ref_mv,
+ fn_ptr, 1, is_second);
+
+ // check to see if refining search is needed.
+ if (num00 > further_steps - n) do_refine = 0;
+
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ *dst_mv = temp_mv;
+ }
+ }
}
- if (method != NSTEP && rd && var < var_max)
- var = av1_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1);
-
- return var;
+ // final 1-away diamond refining search
+ if (do_refine) {
+ const int search_range = 8;
+ MV best_mv = *dst_mv;
+ thissme =
+ masked_refining_search_sad(x, mask, mask_stride, &best_mv, sadpb,
+ search_range, fn_ptr, ref_mv, is_second);
+ if (thissme < INT_MAX)
+ thissme = get_masked_mvpred_var(x, mask, mask_stride, &best_mv, ref_mv,
+ fn_ptr, 1, is_second);
+ if (thissme < bestsme) {
+ bestsme = thissme;
+ *dst_mv = best_mv;
+ }
+ }
+ return bestsme;
}
+#endif // CONFIG_EXT_INTER
#if CONFIG_MOTION_VAR
/* returns subpixel variance error function */
@@ -2780,6 +3075,7 @@
int ref = xd->mi[0]->mbmi.ref_frame[is_second];
const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
setup_pred_plane(&pd->pre[is_second], upsampled_ref->y_buffer,
+ upsampled_ref->y_crop_width, upsampled_ref->y_crop_height,
upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
NULL, pd->subsampling_x, pd->subsampling_y);
}
@@ -2796,7 +3092,7 @@
if (use_upsampled_ref)
besterr = upsampled_setup_obmc_center_error(
xd, mask, bestmv, ref_mv, error_per_bit, vfp, z, y, y_stride, w, h,
- (offset << 3), mvjcost, mvcost, sse1, distortion);
+ (offset * 8), mvjcost, mvcost, sse1, distortion);
else
besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp,
z, y, y_stride, offset, mvjcost, mvcost,
diff --git a/av1/encoder/mcomp.h b/av1/encoder/mcomp.h
index f5047e2..e244a3f 100644
--- a/av1/encoder/mcomp.h
+++ b/av1/encoder/mcomp.h
@@ -63,9 +63,9 @@
int av1_init_search_range(int size);
-int av1_refining_search_sad(const struct macroblock *x, struct mv *ref_mv,
+int av1_refining_search_sad(struct macroblock *x, struct mv *ref_mv,
int sad_per_bit, int distance,
- const struct aom_variance_vtable *fn_ptr,
+ const aom_variance_fn_ptr_t *fn_ptr,
const struct mv *center_mv);
 // Runs a sequence of diamond searches in smaller steps for RD.
@@ -80,22 +80,14 @@
MACROBLOCK *x, BLOCK_SIZE bsize,
int mi_row, int mi_col);
-typedef int(integer_mv_pattern_search_fn)(const MACROBLOCK *x, MV *ref_mv,
- int search_param, int error_per_bit,
- int do_init_search, int *cost_list,
- const aom_variance_fn_ptr_t *vf,
- int use_mvcost, const MV *center_mv,
- MV *best_mv);
-
-integer_mv_pattern_search_fn av1_hex_search;
-integer_mv_pattern_search_fn av1_bigdia_search;
-integer_mv_pattern_search_fn av1_square_search;
-integer_mv_pattern_search_fn av1_fast_hex_search;
-integer_mv_pattern_search_fn av1_fast_dia_search;
+int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv);
typedef int(fractional_mv_step_fp)(
- const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
- int error_per_bit, const aom_variance_fn_ptr_t *vfp,
+ MACROBLOCK *x, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp,
int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
int *distortion, unsigned int *sse1, const uint8_t *second_pred, int w,
@@ -111,18 +103,12 @@
const aom_variance_fn_ptr_t *fn_ptr,
const MV *center_mv, MV *best_mv);
-typedef int (*av1_refining_search_fn_t)(const MACROBLOCK *x, MV *ref_mv,
- int sad_per_bit, int distance,
- const aom_variance_fn_ptr_t *fn_ptr,
- const MV *center_mv);
-
typedef int (*av1_diamond_search_fn_t)(
- const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv,
+ MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv,
int search_param, int sad_per_bit, int *num00,
const aom_variance_fn_ptr_t *fn_ptr, const MV *center_mv);
-int av1_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit,
- int search_range,
+int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
const aom_variance_fn_ptr_t *fn_ptr,
const MV *center_mv, const uint8_t *second_pred);
@@ -131,7 +117,28 @@
int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, MV *mvp_full, int step_param,
int error_per_bit, int *cost_list, const MV *ref_mv,
- MV *tmp_mv, int var_max, int rd);
+ int var_max, int rd);
+
+#if CONFIG_EXT_INTER
+int av1_find_best_masked_sub_pixel_tree(
+ const MACROBLOCK *x, const uint8_t *mask, int mask_stride, MV *bestmv,
+ const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
+ int is_second);
+int av1_find_best_masked_sub_pixel_tree_up(
+ const struct AV1_COMP *cpi, MACROBLOCK *x, const uint8_t *mask,
+ int mask_stride, int mi_row, int mi_col, MV *bestmv, const MV *ref_mv,
+ int allow_hp, int error_per_bit, const aom_variance_fn_ptr_t *vfp,
+ int forced_stop, int iters_per_step, int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1, int is_second, int use_upsampled_ref);
+int av1_masked_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ const uint8_t *mask, int mask_stride,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv, int is_second);
+#endif // CONFIG_EXT_INTER
#if CONFIG_MOTION_VAR
int av1_obmc_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
@@ -146,7 +153,6 @@
int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
int is_second, int use_upsampled_ref);
#endif // CONFIG_MOTION_VAR
-
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c
index e3cd638..56ee074 100644
--- a/av1/encoder/picklpf.c
+++ b/av1/encoder/picklpf.c
@@ -14,8 +14,8 @@
#include "./aom_scale_rtcd.h"
-#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/psnr.h"
+#include "aom_dsp/aom_dsp_common.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
@@ -27,7 +27,7 @@
#include "av1/encoder/picklpf.h"
#include "av1/encoder/quantize.h"
-static int get_max_filter_level(const AV1_COMP *cpi) {
+int av1_get_max_filter_level(const AV1_COMP *cpi) {
if (cpi->oxcf.pass == 2) {
return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
: MAX_LOOP_FILTER;
@@ -42,6 +42,10 @@
AV1_COMMON *const cm = &cpi->common;
int64_t filt_err;
+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION
+ av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1,
+ partial_frame);
+#else
if (cpi->num_workers > 1)
av1_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
filt_level, 1, partial_frame, cpi->workers,
@@ -49,6 +53,7 @@
else
av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
1, partial_frame);
+#endif
#if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) {
@@ -66,15 +71,16 @@
return filt_err;
}
-static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
- int partial_frame) {
+int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+ int partial_frame, double *best_cost_ret) {
const AV1_COMMON *const cm = &cpi->common;
const struct loopfilter *const lf = &cm->lf;
const int min_filter_level = 0;
- const int max_filter_level = get_max_filter_level(cpi);
+ const int max_filter_level = av1_get_max_filter_level(cpi);
int filt_direction = 0;
int64_t best_err;
int filt_best;
+ MACROBLOCK *x = &cpi->td.mb;
// Start the search at the previous frame filter level unless it is now out of
// range.
@@ -113,10 +119,11 @@
}
// If value is close to the best so far then bias towards a lower loop
// filter value.
- if ((ss_err[filt_low] - bias) < best_err) {
+ if (ss_err[filt_low] < (best_err + bias)) {
// Was it actually better than the previous best?
- if (ss_err[filt_low] < best_err) best_err = ss_err[filt_low];
-
+ if (ss_err[filt_low] < best_err) {
+ best_err = ss_err[filt_low];
+ }
filt_best = filt_low;
}
}
@@ -126,7 +133,8 @@
if (ss_err[filt_high] < 0) {
ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame);
}
- // Was it better than the previous best?
+    // Only accept a higher filter value if it is significantly better than
+    // the previous best; this bias discourages raising the filter value.
if (ss_err[filt_high] < (best_err - bias)) {
best_err = ss_err[filt_high];
filt_best = filt_high;
@@ -143,9 +151,15 @@
}
}
+ // Update best error
+ best_err = ss_err[filt_best];
+
+ if (best_cost_ret)
+ *best_cost_ret = RDCOST_DBL(x->rdmult, x->rddiv, 0, best_err);
return filt_best;
}
+#if !CONFIG_LOOP_RESTORATION
void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
LPF_PICK_METHOD method) {
AV1_COMMON *const cm = &cpi->common;
@@ -157,7 +171,7 @@
lf->filter_level = 0;
} else if (method >= LPF_PICK_FROM_Q) {
const int min_filter_level = 0;
- const int max_filter_level = get_max_filter_level(cpi);
+ const int max_filter_level = av1_get_max_filter_level(cpi);
const int q = av1_ac_quant(cm->base_qindex, 0, cm->bit_depth);
// These values were determined by linear fitting the result of the
// searched level, filt_guess = q * 0.316206 + 3.87252
@@ -185,7 +199,15 @@
if (cm->frame_type == KEY_FRAME) filt_guess -= 4;
lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
} else {
- lf->filter_level =
- search_filter_level(sd, cpi, method == LPF_PICK_FROM_SUBIMAGE);
+ lf->filter_level = av1_search_filter_level(
+ sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL);
}
+
+#if CONFIG_EXT_TILE
+ // TODO(any): 0 loopfilter level is only necessary if individual tile
+ // decoding is required. We need to communicate this requirement to this
+ // code and force loop filter level 0 only if required.
+ lf->filter_level = 0;
+#endif // CONFIG_EXT_TILE
}
+#endif // !CONFIG_LOOP_RESTORATION
diff --git a/av1/encoder/picklpf.h b/av1/encoder/picklpf.h
index 3d5c9a8..3c0a834 100644
--- a/av1/encoder/picklpf.h
+++ b/av1/encoder/picklpf.h
@@ -20,7 +20,9 @@
struct yv12_buffer_config;
struct AV1_COMP;
-
+int av1_get_max_filter_level(const AV1_COMP *cpi);
+int av1_search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+ int partial_frame, double *err);
void av1_pick_filter_level(const struct yv12_buffer_config *sd,
struct AV1_COMP *cpi, LPF_PICK_METHOD method);
#ifdef __cplusplus
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
new file mode 100644
index 0000000..62303b7
--- /dev/null
+++ b/av1/encoder/pickrst.c
@@ -0,0 +1,805 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+
+#include "./aom_scale_rtcd.h"
+
+#include "aom_dsp/psnr.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/onyxc_int.h"
+#include "av1/common/quant_common.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/pickrst.h"
+#include "av1/encoder/quantize.h"
+
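+// Signature shared by the per-restoration-type search routines (e.g.
+// search_bilateral below): each fills in *info and best_tile_cost and returns
+// the resulting cost as a double.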
+typedef double (*search_restore_type)(const YV12_BUFFER_CONFIG *src,
+ AV1_COMP *cpi, int filter_level,
+ int partial_frame, RestorationInfo *info,
+ double *best_tile_cost);
+
+const int frame_level_restore_bits[RESTORE_TYPES] = { 2, 2, 2, 2 };
+
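+// Y-plane SSE between the source and the current reconstruction over the
+// given tile rectangle (high-bitdepth aware).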
+static int64_t sse_restoration_tile(const YV12_BUFFER_CONFIG *src,
+ AV1_COMMON *const cm, int h_start,
+ int width, int v_start, int height) {
+ int64_t filt_err;
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ filt_err = aom_highbd_get_y_sse_part(src, cm->frame_to_show, h_start, width,
+ v_start, height);
+ } else {
+ filt_err = aom_get_y_sse_part(src, cm->frame_to_show, h_start, width,
+ v_start, height);
+ }
+#else
+ filt_err = aom_get_y_sse_part(src, cm->frame_to_show, h_start, width, v_start,
+ height);
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ return filt_err;
+}
+
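+// Applies the candidate restoration to the frame, measures the SSE of the
+// given tile/subtile against the source, then re-instates the deblocked frame
+// from cpi->last_frame_db.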
+static int64_t try_restoration_tile(const YV12_BUFFER_CONFIG *src,
+ AV1_COMP *const cpi, RestorationInfo *rsi,
+ int partial_frame, int tile_idx,
+ int subtile_idx, int subtile_bits) {
+ AV1_COMMON *const cm = &cpi->common;
+ int64_t filt_err;
+ int tile_width, tile_height, nhtiles, nvtiles;
+ int h_start, h_end, v_start, v_end;
+ const int ntiles = av1_get_rest_ntiles(cm->width, cm->height, &tile_width,
+ &tile_height, &nhtiles, &nvtiles);
+ (void)ntiles;
+
+ av1_loop_restoration_frame(cm->frame_to_show, cm, rsi, 1, partial_frame);
+ av1_get_rest_tile_limits(tile_idx, subtile_idx, subtile_bits, nhtiles,
+ nvtiles, tile_width, tile_height, cm->width,
+ cm->height, 0, 0, &h_start, &h_end, &v_start,
+ &v_end);
+ filt_err = sse_restoration_tile(src, cm, h_start, h_end - h_start, v_start,
+ v_end - v_start);
+
+ // Re-instate the unfiltered frame
+ aom_yv12_copy_y(&cpi->last_frame_db, cm->frame_to_show);
+ return filt_err;
+}
+
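+// As try_restoration_tile(), but the SSE is measured over the whole frame.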
+static int64_t try_restoration_frame(const YV12_BUFFER_CONFIG *src,
+ AV1_COMP *const cpi, RestorationInfo *rsi,
+ int partial_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ int64_t filt_err;
+ av1_loop_restoration_frame(cm->frame_to_show, cm, rsi, 1, partial_frame);
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ filt_err = aom_highbd_get_y_sse(src, cm->frame_to_show);
+ } else {
+ filt_err = aom_get_y_sse(src, cm->frame_to_show);
+ }
+#else
+ filt_err = aom_get_y_sse(src, cm->frame_to_show);
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+  // Re-instate the deblocked (pre-restoration) frame
+ aom_yv12_copy_y(&cpi->last_frame_db, cm->frame_to_show);
+ return filt_err;
+}
+
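+// Deblock at the given filter level, then search the bilateral filter level
+// for each (sub)tile by RD cost, and evaluate the cost of the combined
+// per-tile configuration over the whole frame.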
+static double search_bilateral(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
+ int filter_level, int partial_frame,
+ RestorationInfo *info, double *best_tile_cost) {
+ BilateralInfo *bilateral_info = info->bilateral_info;
+ AV1_COMMON *const cm = &cpi->common;
+ int i, tile_idx, subtile_idx;
+ int64_t err;
+ int bits;
+ double cost, best_cost, cost_bilateral, cost_norestore_subtile;
+ const int bilateral_level_bits = av1_bilateral_level_bits(&cpi->common);
+ const int bilateral_levels = 1 << bilateral_level_bits;
+ MACROBLOCK *x = &cpi->td.mb;
+ RestorationInfo rsi;
+ int tile_width, tile_height, nhtiles, nvtiles;
+ int h_start, h_end, v_start, v_end;
+ const int ntiles = av1_get_rest_ntiles(cm->width, cm->height, &tile_width,
+ &tile_height, &nhtiles, &nvtiles);
+
+ // Make a copy of the unfiltered / processed recon buffer
+ aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+ av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filter_level,
+ 1, partial_frame);
+ aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_db);
+
+ rsi.frame_restoration_type = RESTORE_BILATERAL;
+ rsi.bilateral_info =
+ (BilateralInfo *)aom_malloc(sizeof(*rsi.bilateral_info) * ntiles);
+ assert(rsi.bilateral_info != NULL);
+
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx)
+ for (subtile_idx = 0; subtile_idx < BILATERAL_SUBTILES; ++subtile_idx)
+ bilateral_info[tile_idx].level[subtile_idx] =
+ rsi.bilateral_info[tile_idx].level[subtile_idx] = -1;
+
+ // Find best filter for each tile
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ for (subtile_idx = 0; subtile_idx < BILATERAL_SUBTILES; ++subtile_idx) {
+ av1_get_rest_tile_limits(tile_idx, subtile_idx, BILATERAL_SUBTILE_BITS,
+ nhtiles, nvtiles, tile_width, tile_height,
+ cm->width, cm->height, 0, 0, &h_start, &h_end,
+ &v_start, &v_end);
+ err = sse_restoration_tile(src, cm, h_start, h_end - h_start, v_start,
+ v_end - v_start);
+#if BILATERAL_SUBTILES
+ // #bits when a subtile is not restored
+ bits = av1_cost_bit(RESTORE_NONE_BILATERAL_PROB, 0);
+#else
+ bits = 0;
+#endif
+ cost_norestore_subtile =
+ RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ best_cost = cost_norestore_subtile;
+
+ for (i = 0; i < bilateral_levels; ++i) {
+ rsi.bilateral_info[tile_idx].level[subtile_idx] = i;
+ err = try_restoration_tile(src, cpi, &rsi, partial_frame, tile_idx,
+ subtile_idx, BILATERAL_SUBTILE_BITS);
+ bits = bilateral_level_bits << AV1_PROB_COST_SHIFT;
+ bits += av1_cost_bit(RESTORE_NONE_BILATERAL_PROB, 1);
+ cost = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ if (cost < best_cost) {
+ bilateral_info[tile_idx].level[subtile_idx] = i;
+ best_cost = cost;
+ }
+ rsi.bilateral_info[tile_idx].level[subtile_idx] = -1;
+ }
+ }
+ bits = 0;
+ for (subtile_idx = 0; subtile_idx < BILATERAL_SUBTILES; ++subtile_idx) {
+ rsi.bilateral_info[tile_idx].level[subtile_idx] =
+ bilateral_info[tile_idx].level[subtile_idx];
+ if (rsi.bilateral_info[tile_idx].level[subtile_idx] >= 0)
+ bits += bilateral_level_bits << AV1_PROB_COST_SHIFT;
+#if BILATERAL_SUBTILES
+ bits +=
+ av1_cost_bit(RESTORE_NONE_BILATERAL_PROB,
+ rsi.bilateral_info[tile_idx].level[subtile_idx] >= 0);
+#endif
+ }
+ err = try_restoration_tile(src, cpi, &rsi, partial_frame, tile_idx, 0, 0);
+ best_tile_cost[tile_idx] = RDCOST_DBL(
+ x->rdmult, x->rddiv,
+ (bits + cpi->switchable_restore_cost[RESTORE_BILATERAL]) >> 4, err);
+ }
+ // Find cost for combined configuration
+ bits = frame_level_restore_bits[rsi.frame_restoration_type]
+ << AV1_PROB_COST_SHIFT;
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ for (subtile_idx = 0; subtile_idx < BILATERAL_SUBTILES; ++subtile_idx) {
+ rsi.bilateral_info[tile_idx].level[subtile_idx] =
+ bilateral_info[tile_idx].level[subtile_idx];
+ if (rsi.bilateral_info[tile_idx].level[subtile_idx] >= 0) {
+ bits += bilateral_level_bits << AV1_PROB_COST_SHIFT;
+ }
+#if BILATERAL_SUBTILES
+ bits +=
+ av1_cost_bit(RESTORE_NONE_BILATERAL_PROB,
+ rsi.bilateral_info[tile_idx].level[subtile_idx] >= 0);
+#endif
+ }
+ }
+ err = try_restoration_frame(src, cpi, &rsi, partial_frame);
+ cost_bilateral = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+
+ aom_free(rsi.bilateral_info);
+
+ aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+ return cost_bilateral;
+}
+
+static double find_average(uint8_t *src, int h_start, int h_end, int v_start,
+ int v_end, int stride) {
+ uint64_t sum = 0;
+ double avg = 0;
+ int i, j;
+ for (i = v_start; i < v_end; i++)
+ for (j = h_start; j < h_end; j++) sum += src[i * stride + j];
+ avg = (double)sum / ((v_end - v_start) * (h_end - h_start));
+ return avg;
+}
+
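+// Accumulate the cross-correlation vector M and autocorrelation matrix H of
+// the degraded (dgd) window pixels against the source, from which the Wiener
+// filter taps are later solved.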
+static void compute_stats(uint8_t *dgd, uint8_t *src, int h_start, int h_end,
+ int v_start, int v_end, int dgd_stride,
+ int src_stride, double *M, double *H) {
+ int i, j, k, l;
+ double Y[RESTORATION_WIN2];
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ memset(M, 0, sizeof(*M) * RESTORATION_WIN2);
+ memset(H, 0, sizeof(*H) * RESTORATION_WIN2 * RESTORATION_WIN2);
+ for (i = v_start; i < v_end; i++) {
+ for (j = h_start; j < h_end; j++) {
+ const double X = (double)src[i * src_stride + j] - avg;
+ int idx = 0;
+ for (k = -RESTORATION_HALFWIN; k <= RESTORATION_HALFWIN; k++) {
+ for (l = -RESTORATION_HALFWIN; l <= RESTORATION_HALFWIN; l++) {
+ Y[idx] = (double)dgd[(i + l) * dgd_stride + (j + k)] - avg;
+ idx++;
+ }
+ }
+ for (k = 0; k < RESTORATION_WIN2; ++k) {
+ M[k] += Y[k] * X;
+ H[k * RESTORATION_WIN2 + k] += Y[k] * Y[k];
+ for (l = k + 1; l < RESTORATION_WIN2; ++l) {
+ double value = Y[k] * Y[l];
+ H[k * RESTORATION_WIN2 + l] += value;
+ H[l * RESTORATION_WIN2 + k] += value;
+ }
+ }
+ }
+ }
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+static double find_average_highbd(uint16_t *src, int h_start, int h_end,
+ int v_start, int v_end, int stride) {
+ uint64_t sum = 0;
+ double avg = 0;
+ int i, j;
+ for (i = v_start; i < v_end; i++)
+ for (j = h_start; j < h_end; j++) sum += src[i * stride + j];
+ avg = (double)sum / ((v_end - v_start) * (h_end - h_start));
+ return avg;
+}
+
+static void compute_stats_highbd(uint8_t *dgd8, uint8_t *src8, int h_start,
+ int h_end, int v_start, int v_end,
+ int dgd_stride, int src_stride, double *M,
+ double *H) {
+ int i, j, k, l;
+ double Y[RESTORATION_WIN2];
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+ const double avg =
+ find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ memset(M, 0, sizeof(*M) * RESTORATION_WIN2);
+ memset(H, 0, sizeof(*H) * RESTORATION_WIN2 * RESTORATION_WIN2);
+ for (i = v_start; i < v_end; i++) {
+ for (j = h_start; j < h_end; j++) {
+ const double X = (double)src[i * src_stride + j] - avg;
+ int idx = 0;
+ for (k = -RESTORATION_HALFWIN; k <= RESTORATION_HALFWIN; k++) {
+ for (l = -RESTORATION_HALFWIN; l <= RESTORATION_HALFWIN; l++) {
+ Y[idx] = (double)dgd[(i + l) * dgd_stride + (j + k)] - avg;
+ idx++;
+ }
+ }
+ for (k = 0; k < RESTORATION_WIN2; ++k) {
+ M[k] += Y[k] * X;
+ H[k * RESTORATION_WIN2 + k] += Y[k] * Y[k];
+ for (l = k + 1; l < RESTORATION_WIN2; ++l) {
+ double value = Y[k] * Y[l];
+ H[k * RESTORATION_WIN2 + l] += value;
+ H[l * RESTORATION_WIN2 + k] += value;
+ }
+ }
+ }
+ }
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+// Solves Ax = b, where x and b are column vectors
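+// Returns 0 if a (near-)zero pivot is encountered during back-substitution,
+// 1 on success.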
+static int linsolve(int n, double *A, int stride, double *b, double *x) {
+ int i, j, k;
+ double c;
+ // Partial pivoting
+ for (i = n - 1; i > 0; i--) {
+ if (A[(i - 1) * stride] < A[i * stride]) {
+ for (j = 0; j < n; j++) {
+ c = A[i * stride + j];
+ A[i * stride + j] = A[(i - 1) * stride + j];
+ A[(i - 1) * stride + j] = c;
+ }
+ c = b[i];
+ b[i] = b[i - 1];
+ b[i - 1] = c;
+ }
+ }
+ // Forward elimination
+ for (k = 0; k < n - 1; k++) {
+ for (i = k; i < n - 1; i++) {
+ c = A[(i + 1) * stride + k] / A[k * stride + k];
+ for (j = 0; j < n; j++) A[(i + 1) * stride + j] -= c * A[k * stride + j];
+ b[i + 1] -= c * b[k];
+ }
+ }
+ // Backward substitution
+ for (i = n - 1; i >= 0; i--) {
+ if (fabs(A[i * stride + i]) < 1e-10) return 0;
+ c = 0;
+ for (j = i + 1; j <= n - 1; j++) c += A[i * stride + j] * x[j];
+ x[i] = (b[i] - c) / A[i * stride + i];
+ }
+ return 1;
+}
+
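+// Map a filter tap index onto the first half of the symmetric filter, so
+// mirrored taps share one coefficient.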
+static INLINE int wrap_index(int i) {
+ return (i >= RESTORATION_HALFWIN1 ? RESTORATION_WIN - 1 - i : i);
+}
+
+// Fix vector b, update vector a
+static void update_a_sep_sym(double **Mc, double **Hc, double *a, double *b) {
+ int i, j;
+ double S[RESTORATION_WIN];
+ double A[RESTORATION_WIN], B[RESTORATION_WIN2];
+ int w, w2;
+ memset(A, 0, sizeof(A));
+ memset(B, 0, sizeof(B));
+ for (i = 0; i < RESTORATION_WIN; i++) {
+ for (j = 0; j < RESTORATION_WIN; ++j) {
+ const int jj = wrap_index(j);
+ A[jj] += Mc[i][j] * b[i];
+ }
+ }
+ for (i = 0; i < RESTORATION_WIN; i++) {
+ for (j = 0; j < RESTORATION_WIN; j++) {
+ int k, l;
+ for (k = 0; k < RESTORATION_WIN; ++k)
+ for (l = 0; l < RESTORATION_WIN; ++l) {
+ const int kk = wrap_index(k);
+ const int ll = wrap_index(l);
+ B[ll * RESTORATION_HALFWIN1 + kk] +=
+ Hc[j * RESTORATION_WIN + i][k * RESTORATION_WIN2 + l] * b[i] *
+ b[j];
+ }
+ }
+ }
+ // Normalization enforcement in the system of equations itself
+ w = RESTORATION_WIN;
+ w2 = (w >> 1) + 1;
+ for (i = 0; i < w2 - 1; ++i)
+ A[i] -=
+ A[w2 - 1] * 2 + B[i * w2 + w2 - 1] - 2 * B[(w2 - 1) * w2 + (w2 - 1)];
+ for (i = 0; i < w2 - 1; ++i)
+ for (j = 0; j < w2 - 1; ++j)
+ B[i * w2 + j] -= 2 * (B[i * w2 + (w2 - 1)] + B[(w2 - 1) * w2 + j] -
+ 2 * B[(w2 - 1) * w2 + (w2 - 1)]);
+ if (linsolve(w2 - 1, B, w2, A, S)) {
+ S[w2 - 1] = 1.0;
+ for (i = w2; i < w; ++i) {
+ S[i] = S[w - 1 - i];
+ S[w2 - 1] -= 2 * S[i];
+ }
+ memcpy(a, S, w * sizeof(*a));
+ }
+}
+
+// Fix vector a, update vector b
+static void update_b_sep_sym(double **Mc, double **Hc, double *a, double *b) {
+ int i, j;
+ double S[RESTORATION_WIN];
+ double A[RESTORATION_WIN], B[RESTORATION_WIN2];
+ int w, w2;
+ memset(A, 0, sizeof(A));
+ memset(B, 0, sizeof(B));
+ for (i = 0; i < RESTORATION_WIN; i++) {
+ const int ii = wrap_index(i);
+ for (j = 0; j < RESTORATION_WIN; j++) A[ii] += Mc[i][j] * a[j];
+ }
+
+ for (i = 0; i < RESTORATION_WIN; i++) {
+ for (j = 0; j < RESTORATION_WIN; j++) {
+ const int ii = wrap_index(i);
+ const int jj = wrap_index(j);
+ int k, l;
+ for (k = 0; k < RESTORATION_WIN; ++k)
+ for (l = 0; l < RESTORATION_WIN; ++l)
+ B[jj * RESTORATION_HALFWIN1 + ii] +=
+ Hc[i * RESTORATION_WIN + j][k * RESTORATION_WIN2 + l] * a[k] *
+ a[l];
+ }
+ }
+ // Normalization enforcement in the system of equations itself
+ w = RESTORATION_WIN;
+ w2 = RESTORATION_HALFWIN1;
+ for (i = 0; i < w2 - 1; ++i)
+ A[i] -=
+ A[w2 - 1] * 2 + B[i * w2 + w2 - 1] - 2 * B[(w2 - 1) * w2 + (w2 - 1)];
+ for (i = 0; i < w2 - 1; ++i)
+ for (j = 0; j < w2 - 1; ++j)
+ B[i * w2 + j] -= 2 * (B[i * w2 + (w2 - 1)] + B[(w2 - 1) * w2 + j] -
+ 2 * B[(w2 - 1) * w2 + (w2 - 1)]);
+ if (linsolve(w2 - 1, B, w2, A, S)) {
+ S[w2 - 1] = 1.0;
+ for (i = w2; i < w; ++i) {
+ S[i] = S[w - 1 - i];
+ S[w2 - 1] -= 2 * S[i];
+ }
+ memcpy(b, S, w * sizeof(*b));
+ }
+}
+
+static int wiener_decompose_sep_sym(double *M, double *H, double *a,
+ double *b) {
+ static const double init_filt[RESTORATION_WIN] = {
+ 0.035623, -0.127154, 0.211436, 0.760190, 0.211436, -0.127154, 0.035623,
+ };
+ int i, j, iter;
+ double *Hc[RESTORATION_WIN2];
+ double *Mc[RESTORATION_WIN];
+ for (i = 0; i < RESTORATION_WIN; i++) {
+ Mc[i] = M + i * RESTORATION_WIN;
+ for (j = 0; j < RESTORATION_WIN; j++) {
+ Hc[i * RESTORATION_WIN + j] =
+ H + i * RESTORATION_WIN * RESTORATION_WIN2 + j * RESTORATION_WIN;
+ }
+ }
+ memcpy(a, init_filt, sizeof(*a) * RESTORATION_WIN);
+ memcpy(b, init_filt, sizeof(*b) * RESTORATION_WIN);
+
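+  // Alternate coordinate-descent updates of the vertical (a) and horizontal
+  // (b) filters for a fixed number of iterations.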
+ iter = 1;
+ while (iter < 10) {
+ update_a_sep_sym(Mc, Hc, a, b);
+ update_b_sep_sym(Mc, Hc, a, b);
+ iter++;
+ }
+ return 1;
+}
+
+// Computes the function x'*A*x - x'*b for the learned filters and compares
+// it against the identity filters; the final score is the difference between
+// the two function values.
+static double compute_score(double *M, double *H, int *vfilt, int *hfilt) {
+ double ab[RESTORATION_WIN * RESTORATION_WIN];
+ int i, k, l;
+ double P = 0, Q = 0;
+ double iP = 0, iQ = 0;
+ double Score, iScore;
+ int w;
+ double a[RESTORATION_WIN], b[RESTORATION_WIN];
+ w = RESTORATION_WIN;
+ a[RESTORATION_HALFWIN] = b[RESTORATION_HALFWIN] = 1.0;
+ for (i = 0; i < RESTORATION_HALFWIN; ++i) {
+ a[i] = a[RESTORATION_WIN - i - 1] =
+ (double)vfilt[i] / RESTORATION_FILT_STEP;
+ b[i] = b[RESTORATION_WIN - i - 1] =
+ (double)hfilt[i] / RESTORATION_FILT_STEP;
+ a[RESTORATION_HALFWIN] -= 2 * a[i];
+ b[RESTORATION_HALFWIN] -= 2 * b[i];
+ }
+ for (k = 0; k < w; ++k) {
+ for (l = 0; l < w; ++l) ab[k * w + l] = a[l] * b[k];
+ }
+ for (k = 0; k < w * w; ++k) {
+ P += ab[k] * M[k];
+ for (l = 0; l < w * w; ++l) Q += ab[k] * H[k * w * w + l] * ab[l];
+ }
+ Score = Q - 2 * P;
+
+ iP = M[(w * w) >> 1];
+ iQ = H[((w * w) >> 1) * w * w + ((w * w) >> 1)];
+ iScore = iQ - 2 * iP;
+
+ return Score - iScore;
+}
+
+#define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
+#define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5))
+
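+// Round the real-valued symmetric filter taps to RESTORATION_FILT_STEP
+// precision and clip each tap to its allowed range.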
+static void quantize_sym_filter(double *f, int *fi) {
+ int i;
+ for (i = 0; i < RESTORATION_HALFWIN; ++i) {
+ fi[i] = RINT(f[i] * RESTORATION_FILT_STEP);
+ }
+ // Specialize for 7-tap filter
+ fi[0] = CLIP(fi[0], WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV);
+ fi[1] = CLIP(fi[1], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+ fi[2] = CLIP(fi[2], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+}
+
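+// Deblock at the given filter level, then, for each tile, estimate a
+// separable symmetric Wiener filter from the pixel statistics, quantize it,
+// and keep it only if it lowers the RD cost versus leaving the tile
+// unrestored.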
+static double search_wiener(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
+ int filter_level, int partial_frame,
+ RestorationInfo *info, double *best_tile_cost) {
+ WienerInfo *wiener_info = info->wiener_info;
+ AV1_COMMON *const cm = &cpi->common;
+ RestorationInfo rsi;
+ int64_t err;
+ int bits;
+ double cost_wiener, cost_norestore_tile;
+ MACROBLOCK *x = &cpi->td.mb;
+ double M[RESTORATION_WIN2];
+ double H[RESTORATION_WIN2 * RESTORATION_WIN2];
+ double vfilterd[RESTORATION_WIN], hfilterd[RESTORATION_WIN];
+ const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show;
+ const int width = cm->width;
+ const int height = cm->height;
+ const int src_stride = src->y_stride;
+ const int dgd_stride = dgd->y_stride;
+ double score;
+ int tile_idx, tile_width, tile_height, nhtiles, nvtiles;
+ int h_start, h_end, v_start, v_end;
+ int i, j;
+
+ const int ntiles = av1_get_rest_ntiles(width, height, &tile_width,
+ &tile_height, &nhtiles, &nvtiles);
+ assert(width == dgd->y_crop_width);
+ assert(height == dgd->y_crop_height);
+ assert(width == src->y_crop_width);
+ assert(height == src->y_crop_height);
+
+ // Make a copy of the unfiltered / processed recon buffer
+ aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+ av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filter_level,
+ 1, partial_frame);
+ aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_db);
+
+ rsi.frame_restoration_type = RESTORE_WIENER;
+ rsi.wiener_info = (WienerInfo *)aom_malloc(sizeof(*rsi.wiener_info) * ntiles);
+ assert(rsi.wiener_info != NULL);
+
+ for (j = 0; j < ntiles; ++j) rsi.wiener_info[j].level = 0;
+
+ // Compute best Wiener filters for each tile
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
+ tile_height, width, height, 0, 0, &h_start, &h_end,
+ &v_start, &v_end);
+ err = sse_restoration_tile(src, cm, h_start, h_end - h_start, v_start,
+ v_end - v_start);
+ // #bits when a tile is not restored
+ bits = av1_cost_bit(RESTORE_NONE_WIENER_PROB, 0);
+ cost_norestore_tile = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ best_tile_cost[tile_idx] = DBL_MAX;
+
+ av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
+ tile_height, width, height, 1, 1, &h_start, &h_end,
+ &v_start, &v_end);
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ compute_stats_highbd(dgd->y_buffer, src->y_buffer, h_start, h_end,
+ v_start, v_end, dgd_stride, src_stride, M, H);
+ else
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ compute_stats(dgd->y_buffer, src->y_buffer, h_start, h_end, v_start,
+ v_end, dgd_stride, src_stride, M, H);
+
+ wiener_info[tile_idx].level = 1;
+ if (!wiener_decompose_sep_sym(M, H, vfilterd, hfilterd)) {
+ wiener_info[tile_idx].level = 0;
+ continue;
+ }
+ quantize_sym_filter(vfilterd, rsi.wiener_info[tile_idx].vfilter);
+ quantize_sym_filter(hfilterd, rsi.wiener_info[tile_idx].hfilter);
+
+    // Filter score computes the value of the function x'*A*x - x'*b for the
+    // learned filter and compares it against the identity filter. If there is
+    // no reduction in the function value, the filter is reverted to identity.
+ score = compute_score(M, H, rsi.wiener_info[tile_idx].vfilter,
+ rsi.wiener_info[tile_idx].hfilter);
+ if (score > 0.0) {
+ wiener_info[tile_idx].level = 0;
+ continue;
+ }
+
+ rsi.wiener_info[tile_idx].level = 1;
+ err = try_restoration_tile(src, cpi, &rsi, partial_frame, tile_idx, 0, 0);
+ bits = WIENER_FILT_BITS << AV1_PROB_COST_SHIFT;
+ bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, 1);
+ cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ if (cost_wiener >= cost_norestore_tile) {
+ wiener_info[tile_idx].level = 0;
+ } else {
+ wiener_info[tile_idx].level = 1;
+ for (i = 0; i < RESTORATION_HALFWIN; ++i) {
+ wiener_info[tile_idx].vfilter[i] = rsi.wiener_info[tile_idx].vfilter[i];
+ wiener_info[tile_idx].hfilter[i] = rsi.wiener_info[tile_idx].hfilter[i];
+ }
+ bits = WIENER_FILT_BITS << AV1_PROB_COST_SHIFT;
+ best_tile_cost[tile_idx] = RDCOST_DBL(
+ x->rdmult, x->rddiv,
+ (bits + cpi->switchable_restore_cost[RESTORE_WIENER]) >> 4, err);
+ }
+ rsi.wiener_info[tile_idx].level = 0;
+ }
+ // Cost for Wiener filtering
+ bits = frame_level_restore_bits[rsi.frame_restoration_type]
+ << AV1_PROB_COST_SHIFT;
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ bits += av1_cost_bit(RESTORE_NONE_WIENER_PROB, wiener_info[tile_idx].level);
+ rsi.wiener_info[tile_idx].level = wiener_info[tile_idx].level;
+ if (wiener_info[tile_idx].level) {
+ bits += (WIENER_FILT_BITS << AV1_PROB_COST_SHIFT);
+ for (i = 0; i < RESTORATION_HALFWIN; ++i) {
+ rsi.wiener_info[tile_idx].vfilter[i] = wiener_info[tile_idx].vfilter[i];
+ rsi.wiener_info[tile_idx].hfilter[i] = wiener_info[tile_idx].hfilter[i];
+ }
+ }
+ }
+ err = try_restoration_frame(src, cpi, &rsi, partial_frame);
+ cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ aom_free(rsi.wiener_info);
+
+ aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+ return cost_wiener;
+}
+
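+// RD cost of applying only the deblocking filter: per-tile costs are filled
+// in for the switchable search, and the frame-level cost is returned.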
+static double search_norestore(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
+ int filter_level, int partial_frame,
+ RestorationInfo *info, double *best_tile_cost) {
+ double err, cost_norestore;
+ int bits;
+ MACROBLOCK *x = &cpi->td.mb;
+ AV1_COMMON *const cm = &cpi->common;
+ int tile_idx, tile_width, tile_height, nhtiles, nvtiles;
+ int h_start, h_end, v_start, v_end;
+ const int ntiles = av1_get_rest_ntiles(cm->width, cm->height, &tile_width,
+ &tile_height, &nhtiles, &nvtiles);
+ (void)info;
+
+ // Make a copy of the unfiltered / processed recon buffer
+ aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+ av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filter_level,
+ 1, partial_frame);
+ aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_db);
+
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ av1_get_rest_tile_limits(tile_idx, 0, 0, nhtiles, nvtiles, tile_width,
+ tile_height, cm->width, cm->height, 0, 0, &h_start,
+ &h_end, &v_start, &v_end);
+ err = sse_restoration_tile(src, cm, h_start, h_end - h_start, v_start,
+ v_end - v_start);
+ best_tile_cost[tile_idx] =
+ RDCOST_DBL(x->rdmult, x->rddiv,
+ (cpi->switchable_restore_cost[RESTORE_NONE] >> 4), err);
+ }
+ // RD cost associated with no restoration
+ err = sse_restoration_tile(src, cm, 0, cm->width, 0, cm->height);
+ bits = frame_level_restore_bits[RESTORE_NONE] << AV1_PROB_COST_SHIFT;
+ cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv, (bits >> 4), err);
+ aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+ return cost_norestore;
+}
+
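+// For each tile, pick the cheapest of the per-type tile costs computed by the
+// searches above; the result is the RD cost of the switchable mode.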
+static double search_switchable_restoration(
+ AV1_COMP *cpi, int filter_level, int partial_frame, RestorationInfo *rsi,
+ double *tile_cost[RESTORE_SWITCHABLE_TYPES]) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *x = &cpi->td.mb;
+ double cost_switchable = 0;
+ int r, bits, tile_idx;
+ const int ntiles =
+ av1_get_rest_ntiles(cm->width, cm->height, NULL, NULL, NULL, NULL);
+
+ // Make a copy of the unfiltered / processed recon buffer
+ aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+ av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filter_level,
+ 1, partial_frame);
+ aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_db);
+
+ rsi->frame_restoration_type = RESTORE_SWITCHABLE;
+ bits = frame_level_restore_bits[rsi->frame_restoration_type]
+ << AV1_PROB_COST_SHIFT;
+ cost_switchable = RDCOST_DBL(x->rdmult, x->rddiv, bits >> 4, 0);
+ for (tile_idx = 0; tile_idx < ntiles; ++tile_idx) {
+ double best_cost = tile_cost[RESTORE_NONE][tile_idx];
+ rsi->restoration_type[tile_idx] = RESTORE_NONE;
+ for (r = 1; r < RESTORE_SWITCHABLE_TYPES; r++) {
+ if (tile_cost[r][tile_idx] < best_cost) {
+ rsi->restoration_type[tile_idx] = r;
+ best_cost = tile_cost[r][tile_idx];
+ }
+ }
+ cost_switchable += best_cost;
+ }
+ aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+ return cost_switchable;
+}
+
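+// Pick the deblocking filter level, evaluate every frame-level restoration
+// type plus the per-tile switchable mode, and keep whichever has the lowest
+// RD cost.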
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi,
+ LPF_PICK_METHOD method) {
+ static search_restore_type search_restore_fun[RESTORE_SWITCHABLE_TYPES] = {
+ search_norestore, search_bilateral, search_wiener,
+ };
+ AV1_COMMON *const cm = &cpi->common;
+ struct loopfilter *const lf = &cm->lf;
+ double cost_restore[RESTORE_TYPES];
+ double *tile_cost[RESTORE_SWITCHABLE_TYPES];
+ double best_cost_restore;
+ RestorationType r, best_restore;
+
+ const int ntiles =
+ av1_get_rest_ntiles(cm->width, cm->height, NULL, NULL, NULL, NULL);
+ cm->rst_info.restoration_type = (RestorationType *)aom_realloc(
+ cm->rst_info.restoration_type,
+ sizeof(*cm->rst_info.restoration_type) * ntiles);
+ cm->rst_info.bilateral_info = (BilateralInfo *)aom_realloc(
+ cm->rst_info.bilateral_info,
+ sizeof(*cm->rst_info.bilateral_info) * ntiles);
+ assert(cm->rst_info.bilateral_info != NULL);
+ cm->rst_info.wiener_info = (WienerInfo *)aom_realloc(
+ cm->rst_info.wiener_info, sizeof(*cm->rst_info.wiener_info) * ntiles);
+ assert(cm->rst_info.wiener_info != NULL);
+
+ for (r = 0; r < RESTORE_SWITCHABLE_TYPES; r++)
+ tile_cost[r] = (double *)aom_malloc(sizeof(*tile_cost[0]) * ntiles);
+
+ lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0 : cpi->oxcf.sharpness;
+
+ if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) {
+ lf->filter_level = 0;
+ cm->rst_info.frame_restoration_type = RESTORE_NONE;
+ } else if (method >= LPF_PICK_FROM_Q) {
+ const int min_filter_level = 0;
+ const int max_filter_level = av1_get_max_filter_level(cpi);
+ const int q = av1_ac_quant(cm->base_qindex, 0, cm->bit_depth);
+// These values were determined by linearly fitting the results of the
+// searched levels: filt_guess = q * 0.316206 + 3.87252
+#if CONFIG_AOM_HIGHBITDEPTH
+ int filt_guess;
+ switch (cm->bit_depth) {
+ case AOM_BITS_8:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+ break;
+ case AOM_BITS_10:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+ break;
+ case AOM_BITS_12:
+ filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+ break;
+ default:
+ assert(0 &&
+ "bit_depth should be AOM_BITS_8, AOM_BITS_10 "
+ "or AOM_BITS_12");
+ return;
+ }
+#else
+ int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ if (cm->frame_type == KEY_FRAME) filt_guess -= 4;
+ lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
+ } else {
+ lf->filter_level =
+ av1_search_filter_level(src, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+ &cost_restore[RESTORE_NONE]);
+ }
+ for (r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) {
+ cost_restore[r] = search_restore_fun[r](src, cpi, lf->filter_level,
+ method == LPF_PICK_FROM_SUBIMAGE,
+ &cm->rst_info, tile_cost[r]);
+ }
+ cost_restore[RESTORE_SWITCHABLE] = search_switchable_restoration(
+ cpi, lf->filter_level, method == LPF_PICK_FROM_SUBIMAGE, &cm->rst_info,
+ tile_cost);
+
+ best_cost_restore = DBL_MAX;
+ best_restore = 0;
+ for (r = 0; r < RESTORE_TYPES; ++r) {
+ if (cost_restore[r] < best_cost_restore) {
+ best_restore = r;
+ best_cost_restore = cost_restore[r];
+ }
+ }
+ cm->rst_info.frame_restoration_type = best_restore;
+ /*
+ printf("Frame %d/%d frame_restore_type %d : %f %f %f %f\n",
+ cm->current_video_frame, cm->show_frame,
+ cm->rst_info.frame_restoration_type,
+ cost_restore[0], cost_restore[1], cost_restore[2], cost_restore[3]);
+ */
+ for (r = 0; r < RESTORE_SWITCHABLE_TYPES; r++) aom_free(tile_cost[r]);
+}
diff --git a/av1/encoder/pickrst.h b/av1/encoder/pickrst.h
new file mode 100644
index 0000000..7ddda43
--- /dev/null
+++ b/av1/encoder/pickrst.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AV1_ENCODER_PICKRST_H_
+#define AV1_ENCODER_PICKRST_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+struct yv12_buffer_config;
+struct AV1_COMP;
+
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+ LPF_PICK_METHOD method);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AV1_ENCODER_PICKRST_H_
diff --git a/av1/encoder/pvq_encoder.c b/av1/encoder/pvq_encoder.c
index 0c80762..2d8340d 100644
--- a/av1/encoder/pvq_encoder.c
+++ b/av1/encoder/pvq_encoder.c
@@ -25,7 +25,7 @@
#include "av1/common/partition.h"
#include "av1/common/pvq_state.h"
#include "av1/encoder/encodemb.h"
-#include "pvq_encoder.h"
+#include "av1/encoder/pvq_encoder.h"
#define OD_PVQ_RATE_APPROX (0)
/*Shift to ensure that the upper bound (i.e. for the max blocksize) of the
@@ -912,7 +912,8 @@
int j;
int tmp;
tmp = 1;
- for (j = i + 1; j < nb_bands; j += 3) {
+  // TODO(yaowu): figure out a better stop condition without a gcc warning.
+ for (j = i + 1; j < nb_bands && j < PVQ_MAX_PARTITIONS; j += 3) {
if (theta[j] != skip_theta_value || qg[j]) tmp = 0;
}
skip_dir |= tmp << i;
diff --git a/av1/encoder/quantize.c b/av1/encoder/quantize.c
index f15b0bf..771f94b 100644
--- a/av1/encoder/quantize.c
+++ b/av1/encoder/quantize.c
@@ -11,16 +11,814 @@
#include <math.h>
#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/quantize.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
#include "av1/common/quant_common.h"
+#include "av1/common/scan.h"
#include "av1/common/seg_common.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/quantize.h"
#include "av1/encoder/rd.h"
+#if CONFIG_NEW_QUANT
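+// Non-uniform quantization: the first NUQ_KNOTS bins use the explicit
+// cumulative boundaries in cuml_bins_ptr; coefficients beyond the last knot
+// are quantized uniformly with the regular quantizer step.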
+static INLINE int quantize_coeff_nuq(
+ const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
+ const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr) {
+ const int coeff = coeffv;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int i, q;
+ int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+ for (i = 0; i < NUQ_KNOTS; i++) {
+ if (tmp < cuml_bins_ptr[i]) {
+ q = i;
+ break;
+ }
+ }
+ if (i == NUQ_KNOTS) {
+ tmp -= cuml_bins_ptr[NUQ_KNOTS - 1];
+ q = NUQ_KNOTS + (((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16);
+ }
+ if (q) {
+ *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val);
+ *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
+ *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+ } else {
+ *qcoeff_ptr = 0;
+ *dqcoeff_ptr = 0;
+ }
+ return (q != 0);
+}
+
+static INLINE int quantize_coeff_bigtx_nuq(
+ const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
+ const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int logsizeby32) {
+ const int coeff = coeffv;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int i, q;
+ int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+ for (i = 0; i < NUQ_KNOTS; i++) {
+ if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], 1 + logsizeby32)) {
+ q = i;
+ break;
+ }
+ }
+ if (i == NUQ_KNOTS) {
+ tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], 1 + logsizeby32);
+ q = NUQ_KNOTS +
+ (((((tmp * quant) >> 16) + tmp) * quant_shift) >> (15 - logsizeby32));
+ }
+ if (q) {
+ *dqcoeff_ptr = ROUND_POWER_OF_TWO(
+ av1_dequant_abscoeff_nuq(q, dequant, dequant_val), 1 + logsizeby32);
+ // *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val) >>
+ // (1 + logsizeby32);
+ *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
+ *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+ } else {
+ *qcoeff_ptr = 0;
+ *dqcoeff_ptr = 0;
+ }
+ return (q != 0);
+}
+
+static INLINE int quantize_coeff_fp_nuq(
+ const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) {
+ const int coeff = coeffv;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int i, q;
+ int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+ for (i = 0; i < NUQ_KNOTS; i++) {
+ if (tmp < cuml_bins_ptr[i]) {
+ q = i;
+ break;
+ }
+ }
+ if (i == NUQ_KNOTS) {
+ q = NUQ_KNOTS +
+ ((((int64_t)tmp - cuml_bins_ptr[NUQ_KNOTS - 1]) * quant) >> 16);
+ }
+ if (q) {
+ *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val);
+ *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
+ *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+ } else {
+ *qcoeff_ptr = 0;
+ *dqcoeff_ptr = 0;
+ }
+ return (q != 0);
+}
+
+static INLINE int quantize_coeff_bigtx_fp_nuq(
+ const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby32) {
+ const int coeff = coeffv;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int i, q;
+ int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+ for (i = 0; i < NUQ_KNOTS; i++) {
+ if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], 1 + logsizeby32)) {
+ q = i;
+ break;
+ }
+ }
+ if (i == NUQ_KNOTS) {
+ q = NUQ_KNOTS +
+ ((((int64_t)tmp -
+ ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], 1 + logsizeby32)) *
+ quant) >>
+ (15 - logsizeby32));
+ }
+ if (q) {
+ *dqcoeff_ptr = ROUND_POWER_OF_TWO(
+ av1_dequant_abscoeff_nuq(q, dequant, dequant_val), 1 + logsizeby32);
+ // *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val) >>
+ // (1 + logsizeby32);
+ *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
+ *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+ } else {
+ *qcoeff_ptr = 0;
+ *dqcoeff_ptr = 0;
+ }
+ return (q != 0);
+}
+
+void quantize_dc_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t quant_shift, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (quantize_coeff_nuq(coeff_ptr[rc], quant, quant_shift, dequant,
+ cuml_bins_ptr, dequant_val, qcoeff_ptr, dqcoeff_ptr))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (quantize_coeff_fp_nuq(coeff_ptr[rc], quant, dequant, cuml_bins_ptr,
+ dequant_val, qcoeff_ptr, dqcoeff_ptr))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void quantize_dc_32x32_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t quant_shift, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant, quant_shift, dequant,
+ cuml_bins_ptr, dequant_val, qcoeff_ptr,
+ dqcoeff_ptr, 0))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void quantize_dc_32x32_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant, dequant,
+ cuml_bins_ptr, dequant_val, qcoeff_ptr,
+ dqcoeff_ptr, 0))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void quantize_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (quantize_coeff_nuq(coeff_ptr[rc], quant_ptr[rc != 0],
+ quant_shift_ptr[rc != 0], dequant_ptr[rc != 0],
+ cuml_bins_ptr[band[i]], dequant_val[band[i]],
+ &qcoeff_ptr[rc], &dqcoeff_ptr[rc]))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void quantize_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (quantize_coeff_fp_nuq(coeff_ptr[rc], quant_ptr[rc != 0],
+ dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
+ dequant_val[band[i]], &qcoeff_ptr[rc],
+ &dqcoeff_ptr[rc]))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void quantize_32x32_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (quantize_coeff_bigtx_nuq(
+ coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0],
+ dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
+ dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc], 0))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void quantize_32x32_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (quantize_coeff_bigtx_fp_nuq(
+ coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0],
+ cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc],
+ &dqcoeff_ptr[rc], 0))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_NEW_QUANT
+
+void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ *eob_ptr = 0;
+}
+
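+// Facade wrappers: dispatch to the regular or 32x32 quantizer depending on
+// qparam->log_scale.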
+void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ ) {
+ // obsolete skip_block
+ const int skip_block = 0;
+
+ if (qparam->log_scale == 0) {
+ av1_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ } else {
+ av1_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ }
+}
+
+void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ ) {
+ // obsolete skip_block
+ const int skip_block = 0;
+
+ if (qparam->log_scale == 0) {
+ aom_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff_ptr, dqcoeff_ptr, pd->dequant,
+ eob_ptr, sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ } else {
+ aom_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ }
+}
+
+void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ ) {
+ // obsolete skip_block
+ const int skip_block = 0;
+ (void)sc;
+ if (qparam->log_scale == 0) {
+ aom_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
+ p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
+ eob_ptr
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ } else {
+ aom_quantize_dc_32x32(coeff_ptr, skip_block, p->round, p->quant_fp[0],
+ qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr
+#if CONFIG_AOM_QM
+ ,
+ qm_ptr, iqm_ptr
+#endif
+ );
+ }
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+void av1_highbd_quantize_fp_facade(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ ) {
+ // obsolete skip_block
+ const int skip_block = 0;
+
+ av1_highbd_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ qparam->log_scale);
+}
+
+void av1_highbd_quantize_b_facade(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ ) {
+ // obsolete skip_block
+ const int skip_block = 0;
+
+ av1_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant, eob_ptr, sc->scan, sc->iscan,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ qparam->log_scale);
+}
+
+void av1_highbd_quantize_dc_facade(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ ) {
+ // obsolete skip_block
+ const int skip_block = 0;
+
+ (void)sc;
+
+ av1_highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
+ p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr,
+ pd->dequant[0], eob_ptr,
+#if CONFIG_AOM_QM
+ qm_ptr, iqm_ptr,
+#endif
+ qparam->log_scale);
+}
+
+#if CONFIG_NEW_QUANT
+static INLINE int highbd_quantize_coeff_nuq(
+ const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
+ const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr) {
+ const int coeff = coeffv;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int i, q;
+ int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
+ for (i = 0; i < NUQ_KNOTS; i++) {
+ if (tmp < cuml_bins_ptr[i]) {
+ q = i;
+ break;
+ }
+ }
+ if (i == NUQ_KNOTS) {
+ tmp -= cuml_bins_ptr[NUQ_KNOTS - 1];
+ q = NUQ_KNOTS + (int)(((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16);
+ }
+ if (q) {
+ *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val);
+ *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
+ *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+ } else {
+ *qcoeff_ptr = 0;
+ *dqcoeff_ptr = 0;
+ }
+ return (q != 0);
+}
+
+static INLINE int highbd_quantize_coeff_fp_nuq(
+ const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) {
+ const int coeff = coeffv;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int i, q;
+ int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
+ for (i = 0; i < NUQ_KNOTS; i++) {
+ if (tmp < cuml_bins_ptr[i]) {
+ q = i;
+ break;
+ }
+ }
+ if (i == NUQ_KNOTS) {
+ q = NUQ_KNOTS + (int)(((tmp - cuml_bins_ptr[NUQ_KNOTS - 1]) * quant) >> 16);
+ }
+ if (q) {
+ *dqcoeff_ptr = av1_dequant_abscoeff_nuq(q, dequant, dequant_val);
+ *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
+ *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+ } else {
+ *qcoeff_ptr = 0;
+ *dqcoeff_ptr = 0;
+ }
+ return (q != 0);
+}
+
+static INLINE int highbd_quantize_coeff_bigtx_fp_nuq(
+ const tran_low_t coeffv, const int16_t quant, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int logsizeby32) {
+ const int coeff = coeffv;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int i, q;
+ int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
+ for (i = 0; i < NUQ_KNOTS; i++) {
+ if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], 1 + logsizeby32)) {
+ q = i;
+ break;
+ }
+ }
+ if (i == NUQ_KNOTS) {
+ q = NUQ_KNOTS +
+ (int)(((tmp - ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1],
+ 1 + logsizeby32)) *
+ quant) >>
+ (15 - logsizeby32));
+ }
+ if (q) {
+ *dqcoeff_ptr = ROUND_POWER_OF_TWO(
+ av1_dequant_abscoeff_nuq(q, dequant, dequant_val), 1 + logsizeby32);
+ *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
+ *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+ } else {
+ *qcoeff_ptr = 0;
+ *dqcoeff_ptr = 0;
+ }
+ return (q != 0);
+}
+
+static INLINE int highbd_quantize_coeff_bigtx_nuq(
+ const tran_low_t coeffv, const int16_t quant, const int16_t quant_shift,
+ const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int logsizeby32) {
+ const int coeff = coeffv;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int i, q;
+ int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
+ for (i = 0; i < NUQ_KNOTS; i++) {
+ if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], 1 + logsizeby32)) {
+ q = i;
+ break;
+ }
+ }
+ if (i == NUQ_KNOTS) {
+ tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], 1 + logsizeby32);
+ q = NUQ_KNOTS + (int)(((((tmp * quant) >> 16) + tmp) * quant_shift) >>
+ (15 - logsizeby32));
+ }
+ if (q) {
+ *dqcoeff_ptr = ROUND_POWER_OF_TWO(
+ av1_dequant_abscoeff_nuq(q, dequant, dequant_val), 1 + logsizeby32);
+ *qcoeff_ptr = (q ^ coeff_sign) - coeff_sign;
+ *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+ } else {
+ *qcoeff_ptr = 0;
+ *dqcoeff_ptr = 0;
+ }
+ return (q != 0);
+}
+
+void highbd_quantize_dc_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t quant_shift, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (highbd_quantize_coeff_nuq(coeff_ptr[rc], quant, quant_shift, dequant,
+ cuml_bins_ptr, dequant_val, qcoeff_ptr,
+ dqcoeff_ptr))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (highbd_quantize_coeff_fp_nuq(coeff_ptr[rc], quant, dequant,
+ cuml_bins_ptr, dequant_val, qcoeff_ptr,
+ dqcoeff_ptr))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (highbd_quantize_coeff_nuq(
+ coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0],
+ dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
+ dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc]))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_32x32_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (highbd_quantize_coeff_bigtx_nuq(
+ coeff_ptr[rc], quant_ptr[rc != 0], quant_shift_ptr[rc != 0],
+ dequant_ptr[rc != 0], cuml_bins_ptr[band[i]],
+ dequant_val[band[i]], &qcoeff_ptr[rc], &dqcoeff_ptr[rc], 0))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_32x32_fp_nuq_c(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, int skip_block,
+ const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (highbd_quantize_coeff_bigtx_fp_nuq(
+ coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0],
+ cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc],
+ &dqcoeff_ptr[rc], 0))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_fp_nuq_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ const cuml_bins_type_nuq *cuml_bins_ptr,
+ const dequant_val_type_nuq *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const uint8_t *band) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ int i;
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ if (highbd_quantize_coeff_fp_nuq(
+ coeff_ptr[rc], quant_ptr[rc != 0], dequant_ptr[rc != 0],
+ cuml_bins_ptr[band[i]], dequant_val[band[i]], &qcoeff_ptr[rc],
+ &dqcoeff_ptr[rc]))
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_dc_32x32_nuq(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t quant, const int16_t quant_shift, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (highbd_quantize_coeff_bigtx_nuq(coeff_ptr[rc], quant, quant_shift,
+ dequant, cuml_bins_ptr, dequant_val,
+ qcoeff_ptr, dqcoeff_ptr, 0))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_dc_32x32_fp_nuq(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t quant, const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+ int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ if (!skip_block) {
+ const int rc = 0;
+ if (highbd_quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc], quant, dequant,
+ cuml_bins_ptr, dequant_val,
+ qcoeff_ptr, dqcoeff_ptr, 0))
+ eob = 0;
+ }
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_NEW_QUANT
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -85,14 +883,15 @@
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan
+ const int16_t *scan, const int16_t *iscan,
#if CONFIG_AOM_QM
- ,
- const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
#endif
- ) {
+ int log_scale) {
int i;
int eob = -1;
+ const int scale = 1 << log_scale;
+ const int shift = 16 - log_scale;
// TODO(jingning) Decide the need of these arguments after the
// quantization process is completed.
(void)zbin_ptr;
@@ -120,20 +919,22 @@
const int64_t tmp = abs_coeff + round_ptr[rc != 0];
#if CONFIG_AOM_QM
const uint32_t abs_qcoeff =
- (uint32_t)((tmp * quant_ptr[rc != 0] * wt) >> (16 + AOM_QM_BITS));
+ (uint32_t)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS));
qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / scale;
#else
- const uint32_t abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 16);
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp * quant_ptr[rc != 0]) >> shift);
qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / scale;
#endif
if (abs_qcoeff) eob = i;
}
}
*eob_ptr = eob + 1;
}
-#endif
+
+#endif // CONFIG_AOM_HIGHBITDEPTH
// TODO(jingning) Refactor this file and combine functions with similar
// operations.
@@ -199,30 +1000,55 @@
}
#if CONFIG_AOM_HIGHBITDEPTH
-void av1_highbd_quantize_fp_32x32_c(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan
+void av1_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan,
#if CONFIG_AOM_QM
- ,
- const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
#endif
- ) {
- int i, eob = -1;
- (void)zbin_ptr;
- (void)quant_shift_ptr;
+ int log_scale) {
+ int i, non_zero_count = (int)n_coeffs, eob = -1;
+ int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+ int round[2] = { round_ptr[0], round_ptr[1] };
+ int nzbins[2];
+ int scale = 1;
+ int shift = 16;
(void)iscan;
+ if (log_scale > 0) {
+ zbins[0] = ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale);
+ zbins[1] = ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale);
+ round[0] = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+ round[1] = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
+ scale = 1 << log_scale;
+ shift = 16 - log_scale;
+ }
+
+ nzbins[0] = zbins[0] * -1;
+ nzbins[1] = zbins[1] * -1;
+
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
- for (i = 0; i < n_coeffs; i++) {
- uint32_t abs_qcoeff = 0;
+ // Pre-scan pass
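+    // (walk backwards in scan order and drop trailing coefficients that fall
+    // inside the zero-bin; they cannot quantize to a nonzero value)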
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
+ if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+ non_zero_count--;
+ else
+ break;
+ }
+
+    // Quantization pass: all coefficients with index >= non_zero_count are
+    // skippable. Note: non_zero_count can be zero.
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
#if CONFIG_AOM_QM
const qm_val_t wt = qm_ptr[rc];
const qm_val_t iwt = iqm_ptr[rc];
@@ -230,70 +1056,67 @@
(dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
AOM_QM_BITS;
#endif
+ const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
#if CONFIG_AOM_QM
- if (abs_coeff * wt >= (dequant_ptr[rc != 0] << (AOM_QM_BITS - 2))) {
+ if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
#else
- if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
-#endif
- const int64_t tmp =
- abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-#if CONFIG_AOM_QM
- abs_qcoeff =
- (uint32_t)((tmp * wt * quant_ptr[rc != 0]) >> (AOM_QM_BITS + 15));
- qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
- dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / 2;
-#else
- abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 15);
- qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
- dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant_ptr[rc != 0]) / 2;
-#endif
- }
- if (abs_qcoeff) eob = i;
+ if (abs_coeff >= zbins[rc != 0]) {
+#endif
+ const int64_t tmp1 = abs_coeff + round[rc != 0];
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+#if CONFIG_AOM_QM
+ const uint32_t abs_qcoeff = (uint32_t)(
+ (tmp2 * wt * quant_shift_ptr[rc != 0]) >> (AOM_QM_BITS + shift));
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[rc] = (qcoeff_ptr[rc] * dequant) / scale;
+#else
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> shift);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / scale;
+#endif // CONFIG_AOM_QM
+ if (abs_qcoeff) eob = i;
+ }
}
}
*eob_ptr = eob + 1;
}
#endif
-void av1_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
- const int16_t *scan, const int16_t *iscan) {
- MACROBLOCKD *const xd = &x->e_mbd;
- struct macroblock_plane *p = &x->plane[plane];
- struct macroblockd_plane *pd = &xd->plane[plane];
-#if CONFIG_AOM_QM
- int seg_id = xd->mi[0]->mbmi.segment_id;
- int is_intra = is_inter_block(&xd->mi[0]->mbmi);
- const qm_val_t *qmatrix = pd->seg_qmatrix[seg_id][is_intra][0];
- const qm_val_t *iqmatrix = pd->seg_iqmatrix[seg_id][is_intra][0];
-#endif
-
#if CONFIG_AOM_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- aom_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block,
- p->zbin, p->round, p->quant, p->quant_shift,
- BLOCK_OFFSET(p->qcoeff, block),
- BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant,
-#if !CONFIG_AOM_QM
- &p->eobs[block], scan, iscan);
-#else
- &p->eobs[block], scan, iscan, qmatrix, iqmatrix);
+void av1_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+ int skip_block, const int16_t *round_ptr,
+ const int16_t quant, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+ uint16_t *eob_ptr,
+#if CONFIG_AOM_QM
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
#endif
- return;
+ const int log_scale) {
+ int eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+#if CONFIG_AOM_QM
+ (void)qm_ptr;
+ (void)iqm_ptr;
+#endif
+ if (!skip_block) {
+ const int coeff = coeff_ptr[0];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp = abs_coeff + round_ptr[0];
+ const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> (16 - log_scale));
+ qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / (1 << log_scale);
+ if (abs_qcoeff) eob = 0;
}
-#endif
- aom_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block, p->zbin,
- p->round, p->quant, p->quant_shift,
- BLOCK_OFFSET(p->qcoeff, block),
- BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant, &p->eobs[block],
-#if !CONFIG_AOM_QM
- scan, iscan);
-#else
- scan, iscan, qmatrix, iqmatrix);
-#endif
+ *eob_ptr = eob + 1;
}
+#endif
static void invert_quant(int16_t *quant, int16_t *shift, int d) {
uint32_t t;
@@ -326,15 +1149,16 @@
AV1_COMMON *const cm = &cpi->common;
QUANTS *const quants = &cpi->quants;
int i, q, quant;
+#if CONFIG_NEW_QUANT
+ int dq;
+#endif
for (q = 0; q < QINDEX_RANGE; q++) {
const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth);
const int qrounding_factor = q == 0 ? 64 : 48;
for (i = 0; i < 2; ++i) {
- int qrounding_factor_fp = i == 0 ? 48 : 42;
- if (q == 0) qrounding_factor_fp = 64;
-
+ int qrounding_factor_fp = 64;
// y
quant = i == 0 ? av1_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth)
: av1_ac_quant(q, 0, cm->bit_depth);
@@ -357,7 +1181,20 @@
cpi->uv_dequant[q][i] = quant;
}
- for (i = 2; i < 8; i++) {
+#if CONFIG_NEW_QUANT
+ for (dq = 0; dq < QUANT_PROFILES; dq++) {
+ for (i = 0; i < COEF_BANDS; i++) {
+ const int y_quant = cpi->y_dequant[q][i != 0];
+ const int uvquant = cpi->uv_dequant[q][i != 0];
+ av1_get_dequant_val_nuq(y_quant, i, cpi->y_dequant_val_nuq[dq][q][i],
+ quants->y_cuml_bins_nuq[dq][q][i], dq);
+ av1_get_dequant_val_nuq(uvquant, i, cpi->uv_dequant_val_nuq[dq][q][i],
+ quants->uv_cuml_bins_nuq[dq][q][i], dq);
+ }
+ }
+#endif // CONFIG_NEW_QUANT
+
+ for (i = 2; i < 8; i++) { // 8: SIMD width
quants->y_quant[q][i] = quants->y_quant[q][1];
quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
@@ -377,11 +1214,11 @@
}
}
-void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x) {
+void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
+ int segment_id) {
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
const QUANTS *const quants = &cpi->quants;
- const int segment_id = xd->mi[0]->mbmi.segment_id;
#if CONFIG_DELTA_Q
int current_q_index = cpi->oxcf.aq_mode == DELTA_AQ
@@ -401,6 +1238,9 @@
? NUM_QM_LEVELS - 1
: aom_get_qmlevel(cm->base_qindex, minqm, maxqm);
#endif
+#if CONFIG_NEW_QUANT
+ int dq;
+#endif
// Y
x->plane[0].quant = quants->y_quant[qindex];
@@ -416,6 +1256,12 @@
sizeof(cm->giqmatrix[qmlevel][0]));
#endif
xd->plane[0].dequant = cpi->y_dequant[qindex];
+#if CONFIG_NEW_QUANT
+ for (dq = 0; dq < QUANT_PROFILES; dq++) {
+ x->plane[0].cuml_bins_nuq[dq] = quants->y_cuml_bins_nuq[dq][qindex];
+ xd->plane[0].dequant_val_nuq[dq] = cpi->y_dequant_val_nuq[dq][qindex];
+ }
+#endif // CONFIG_NEW_QUANT
x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0];
x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1];
@@ -435,21 +1281,29 @@
sizeof(cm->giqmatrix[qmlevel][1]));
#endif
xd->plane[i].dequant = cpi->uv_dequant[qindex];
+#if CONFIG_NEW_QUANT
+ for (dq = 0; dq < QUANT_PROFILES; dq++) {
+ x->plane[i].cuml_bins_nuq[dq] = quants->uv_cuml_bins_nuq[dq][qindex];
+ xd->plane[i].dequant_val_nuq[dq] = cpi->uv_dequant_val_nuq[dq][qindex];
+ }
+#endif // CONFIG_NEW_QUANT
x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0];
x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1];
}
x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
- x->q_index = qindex;
+ x->qindex = qindex;
set_error_per_bit(x, rdmult);
- av1_initialize_me_consts(cpi, x, x->q_index);
+ av1_initialize_me_consts(cpi, x, qindex);
}
void av1_frame_init_quantizer(AV1_COMP *cpi) {
- av1_init_plane_quantizers(cpi, &cpi->td.mb);
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
}
void av1_set_quantizer(AV1_COMMON *cm, int q) {
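The scalar arithmetic introduced above for av1_highbd_quantize_dc() is easiest to follow in isolation. The sketch below reproduces the round/quantize/dequantize steps outside the encoder; the function name and the quant/round/dequant values are made up, and log_scale stands in for the extra precision of larger transforms.

  #include <stdint.h>
  #include <stdio.h>

  /* Standalone sketch of the DC quantize/dequantize arithmetic in
   * av1_highbd_quantize_dc(): quant is a 16-bit fixed-point multiplier
   * (roughly (1 << 16) / dequant) and log_scale grows with transform size.
   * All values are hypothetical. */
  static int32_t quantize_dc_sketch(int32_t coeff, int16_t round, int16_t quant,
                                    int16_t dequant, int log_scale,
                                    int32_t *dqcoeff) {
    const int coeff_sign = coeff >> 31; /* 0 or -1 */
    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
    const int64_t tmp = abs_coeff + round;
    const uint32_t abs_q = (uint32_t)((tmp * quant) >> (16 - log_scale));
    const int32_t qcoeff = (int32_t)((abs_q ^ coeff_sign) - coeff_sign);
    *dqcoeff = qcoeff * dequant / (1 << log_scale);
    return qcoeff;
  }

  int main(void) {
    int32_t dq;
    const int32_t q = quantize_dc_sketch(-1234, 8, 1311, 50, 1, &dq);
    printf("qcoeff=%d dqcoeff=%d\n", q, dq); /* qcoeff=-49 dqcoeff=-1225 */
    return 0;
  }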
diff --git a/av1/encoder/quantize.h b/av1/encoder/quantize.h
index 1091c41..f5f045e 100644
--- a/av1/encoder/quantize.h
+++ b/av1/encoder/quantize.h
@@ -14,13 +14,35 @@
#include "./aom_config.h"
#include "av1/common/quant_common.h"
+#include "av1/common/scan.h"
#include "av1/encoder/block.h"
#ifdef __cplusplus
extern "C" {
#endif
+typedef struct QUANT_PARAM { int log_scale; } QUANT_PARAM;
+
+typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ );
+
typedef struct {
+#if CONFIG_NEW_QUANT
+ DECLARE_ALIGNED(16, tran_low_t, y_cuml_bins_nuq[QUANT_PROFILES][QINDEX_RANGE]
+ [COEF_BANDS][NUQ_KNOTS]);
+ DECLARE_ALIGNED(16, tran_low_t, uv_cuml_bins_nuq[QUANT_PROFILES][QINDEX_RANGE]
+ [COEF_BANDS][NUQ_KNOTS]);
+#endif // CONFIG_NEW_QUANT
+ // 0: dc, 1: ac, 2-7: ac repeated out to the SIMD width of 8
DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
@@ -39,15 +61,13 @@
DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
} QUANTS;
-void av1_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
- const int16_t *scan, const int16_t *iscan);
-
struct AV1_COMP;
struct AV1Common;
void av1_frame_init_quantizer(struct AV1_COMP *cpi);
-void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x);
+void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ int segment_id);
void av1_init_quantizer(struct AV1_COMP *cpi);
@@ -57,6 +77,142 @@
int av1_qindex_to_quantizer(int qindex);
+void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
+
+void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ );
+
+void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ );
+
+void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ );
+
+#if CONFIG_NEW_QUANT
+void quantize_dc_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t quant_shift, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
+void quantize_dc_32x32_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t quant_shift, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr);
+void quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
+void quantize_dc_32x32_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr);
+#endif // CONFIG_NEW_QUANT
+
+#if CONFIG_AOM_HIGHBITDEPTH
+void av1_highbd_quantize_fp_facade(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ );
+
+void av1_highbd_quantize_b_facade(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ );
+
+void av1_highbd_quantize_dc_facade(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam
+#if CONFIG_AOM_QM
+ ,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr
+#endif
+ );
+
+void av1_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+ int skip_block, const int16_t *round_ptr,
+ const int16_t quant, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+ uint16_t *eob_ptr,
+#if CONFIG_AOM_QM
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
+#endif
+ const int log_scale);
+#if CONFIG_NEW_QUANT
+void highbd_quantize_dc_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t quant_shift, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr);
+void highbd_quantize_dc_32x32_nuq(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t quant, const int16_t quant_shift, const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr, const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
+void highbd_quantize_dc_fp_nuq(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t quant,
+ const int16_t dequant,
+ const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ uint16_t *eob_ptr);
+void highbd_quantize_dc_32x32_fp_nuq(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
+ const int16_t quant, const int16_t dequant, const tran_low_t *cuml_bins_ptr,
+ const tran_low_t *dequant_val, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
+
+#endif // CONFIG_NEW_QUANT
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
#ifdef __cplusplus
} // extern "C"
#endif
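The AV1_QUANT_FACADE typedef above gives every quantizer variant (b, fp, dc and their high-bit-depth counterparts) one calling convention, with per-call transform information carried in QUANT_PARAM, so call sites can select an implementation through a function pointer. The shape of that dispatch is sketched below with simplified stand-in types; quant_param_sketch, quantize_b_sketch and quantize_fp_sketch are hypothetical, not the encoder's real functions.

  #include <stdint.h>
  #include <stdio.h>

  /* Simplified analogue of AV1_QUANT_FACADE: one shared signature, with the
   * per-call parameters (here only log_scale, mirroring QUANT_PARAM) passed
   * in a small struct. */
  typedef struct { int log_scale; } quant_param_sketch;
  typedef void (*quant_facade_sketch)(const int32_t *coeff, int n,
                                      int32_t *qcoeff,
                                      const quant_param_sketch *qparam);

  static void quantize_b_sketch(const int32_t *coeff, int n, int32_t *qcoeff,
                                const quant_param_sketch *qparam) {
    for (int i = 0; i < n; ++i) qcoeff[i] = coeff[i] >> (2 + qparam->log_scale);
  }

  static void quantize_fp_sketch(const int32_t *coeff, int n, int32_t *qcoeff,
                                 const quant_param_sketch *qparam) {
    for (int i = 0; i < n; ++i) qcoeff[i] = coeff[i] >> (1 + qparam->log_scale);
  }

  int main(void) {
    const int32_t coeff[4] = { 40, -12, 7, 0 };
    int32_t qcoeff[4];
    const quant_param_sketch qparam = { 1 }; /* e.g. a 32x32 transform */
    const int use_fp = 1;                    /* e.g. chosen from speed features */
    /* The call site holds a pointer and never cares which variant runs. */
    const quant_facade_sketch quantize =
        use_fp ? quantize_fp_sketch : quantize_b_sketch;
    quantize(coeff, 4, qcoeff, &qparam);
    for (int i = 0; i < 4; ++i) printf("%d ", qcoeff[i]);
    printf("\n");
    return 0;
  }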
diff --git a/av1/encoder/ransac.c b/av1/encoder/ransac.c
new file mode 100644
index 0000000..2699c4f
--- /dev/null
+++ b/av1/encoder/ransac.c
@@ -0,0 +1,369 @@
+/*
+ * (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <memory.h>
+#include <math.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "av1/encoder/ransac.h"
+
+#define MAX_MINPTS 4
+
+#define MAX_DEGENERATE_ITER 10
+#define MINPTS_MULTIPLIER 5
+
+////////////////////////////////////////////////////////////////////////////////
+// ransac
+typedef int (*IsDegenerateFunc)(double *p);
+typedef void (*NormalizeFunc)(double *p, int np, double *T);
+typedef void (*DenormalizeFunc)(double *params, double *T1, double *T2);
+typedef int (*FindTransformationFunc)(int points, double *points1,
+ double *points2, double *params);
+typedef void (*ProjectPointsDoubleFunc)(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj);
+
+static void project_points_double_translation(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj) {
+ int i;
+ for (i = 0; i < n; ++i) {
+ const double x = *(points++), y = *(points++);
+ *(proj++) = x + mat[1];
+ *(proj++) = y + mat[0];
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+static void project_points_double_rotzoom(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj) {
+ int i;
+ for (i = 0; i < n; ++i) {
+ const double x = *(points++), y = *(points++);
+ *(proj++) = mat[3] * x + mat[2] * y + mat[1];
+ *(proj++) = -mat[2] * x + mat[3] * y + mat[0];
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+static void project_points_double_affine(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj) {
+ int i;
+ for (i = 0; i < n; ++i) {
+ const double x = *(points++), y = *(points++);
+ *(proj++) = mat[3] * x + mat[2] * y + mat[1];
+ *(proj++) = mat[4] * x + mat[5] * y + mat[0];
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+static void project_points_double_homography(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj) {
+ int i;
+ double x, y, Z, Z_inv;
+ for (i = 0; i < n; ++i) {
+ x = *(points++), y = *(points++);
+ Z_inv = mat[7] * x + mat[6] * y + 1;
+ assert(fabs(Z_inv) > 0.00001);
+ Z = 1. / Z_inv;
+ *(proj++) = (mat[1] * x + mat[0] * y + mat[3]) * Z;
+ *(proj++) = (mat[2] * x + mat[4] * y + mat[5]) * Z;
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+static int get_rand_indices(int npoints, int minpts, int *indices,
+ unsigned int *seed) {
+ int i, j;
+ int ptr = rand_r(seed) % npoints;
+ if (minpts > npoints) return 0;
+ indices[0] = ptr;
+ ptr = (ptr == npoints - 1 ? 0 : ptr + 1);
+ i = 1;
+ while (i < minpts) {
+ int index = rand_r(seed) % npoints;
+ while (index) {
+ ptr = (ptr == npoints - 1 ? 0 : ptr + 1);
+ for (j = 0; j < i; ++j) {
+ if (indices[j] == ptr) break;
+ }
+ if (j == i) index--;
+ }
+ indices[i++] = ptr;
+ }
+ return 1;
+}
+
+static int ransac(double *matched_points, int npoints, int *number_of_inliers,
+ int *best_inlier_mask, double *best_params, const int minpts,
+ const int paramdim, IsDegenerateFunc is_degenerate,
+ NormalizeFunc normalize, DenormalizeFunc denormalize,
+ FindTransformationFunc find_transformation,
+ ProjectPointsDoubleFunc projectpoints) {
+ static const double INLIER_THRESHOLD_NORMALIZED = 0.1;
+ static const double INLIER_THRESHOLD_UNNORMALIZED = 1.0;
+ static const double PROBABILITY_REQUIRED = 0.9;
+ static const double EPS = 1e-12;
+ static const int MIN_TRIALS = 20;
+
+ const double inlier_threshold =
+ (normalize && denormalize ? INLIER_THRESHOLD_NORMALIZED
+ : INLIER_THRESHOLD_UNNORMALIZED);
+ int N = 10000, trial_count = 0;
+ int i;
+ int ret_val = 0;
+ unsigned int seed = (unsigned int)npoints;
+
+ int max_inliers = 0;
+ double best_variance = 0.0;
+ double params[MAX_PARAMDIM];
+ WarpedMotionParams wm;
+ double points1[2 * MAX_MINPTS];
+ double points2[2 * MAX_MINPTS];
+ int indices[MAX_MINPTS] = { 0 };
+
+ double *best_inlier_set1;
+ double *best_inlier_set2;
+ double *inlier_set1;
+ double *inlier_set2;
+ double *corners1;
+ double *corners2;
+ double *image1_coord;
+ int *inlier_mask;
+
+ double *cnp1, *cnp2;
+ double T1[9], T2[9];
+
+ *number_of_inliers = 0;
+ if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) {
+ printf("Cannot find motion with %d matches\n", npoints);
+ return 1;
+ }
+
+ memset(&wm, 0, sizeof(wm));
+ best_inlier_set1 =
+ (double *)aom_malloc(sizeof(*best_inlier_set1) * npoints * 2);
+ best_inlier_set2 =
+ (double *)aom_malloc(sizeof(*best_inlier_set2) * npoints * 2);
+ inlier_set1 = (double *)aom_malloc(sizeof(*inlier_set1) * npoints * 2);
+ inlier_set2 = (double *)aom_malloc(sizeof(*inlier_set2) * npoints * 2);
+ corners1 = (double *)aom_malloc(sizeof(*corners1) * npoints * 2);
+ corners2 = (double *)aom_malloc(sizeof(*corners2) * npoints * 2);
+ image1_coord = (double *)aom_malloc(sizeof(*image1_coord) * npoints * 2);
+ inlier_mask = (int *)aom_malloc(sizeof(*inlier_mask) * npoints);
+
+ if (!(best_inlier_set1 && best_inlier_set2 && inlier_set1 && inlier_set2 &&
+ corners1 && corners2 && image1_coord && inlier_mask)) {
+ ret_val = 1;
+ goto finish_ransac;
+ }
+
+ for (cnp1 = corners1, cnp2 = corners2, i = 0; i < npoints; ++i) {
+ *(cnp1++) = *(matched_points++);
+ *(cnp1++) = *(matched_points++);
+ *(cnp2++) = *(matched_points++);
+ *(cnp2++) = *(matched_points++);
+ }
+ matched_points -= 4 * npoints;
+
+ if (normalize && denormalize) {
+ normalize(corners1, npoints, T1);
+ normalize(corners2, npoints, T2);
+ }
+
+ while (N > trial_count) {
+ int num_inliers = 0;
+ double sum_distance = 0.0;
+ double sum_distance_squared = 0.0;
+
+ int degenerate = 1;
+ int num_degenerate_iter = 0;
+ while (degenerate) {
+ num_degenerate_iter++;
+ if (!get_rand_indices(npoints, minpts, indices, &seed)) {
+ ret_val = 1;
+ goto finish_ransac;
+ }
+ i = 0;
+ while (i < minpts) {
+ int index = indices[i];
+ // add to list
+ points1[i * 2] = corners1[index * 2];
+ points1[i * 2 + 1] = corners1[index * 2 + 1];
+ points2[i * 2] = corners2[index * 2];
+ points2[i * 2 + 1] = corners2[index * 2 + 1];
+ i++;
+ }
+ degenerate = is_degenerate(points1);
+ if (num_degenerate_iter > MAX_DEGENERATE_ITER) {
+ ret_val = 1;
+ goto finish_ransac;
+ }
+ }
+
+ if (find_transformation(minpts, points1, points2, params)) {
+ trial_count++;
+ continue;
+ }
+
+ projectpoints(params, corners1, image1_coord, npoints, 2, 2);
+
+ for (i = 0; i < npoints; ++i) {
+ double dx = image1_coord[i * 2] - corners2[i * 2];
+ double dy = image1_coord[i * 2 + 1] - corners2[i * 2 + 1];
+ double distance = sqrt(dx * dx + dy * dy);
+
+ inlier_mask[i] = distance < inlier_threshold;
+ if (inlier_mask[i]) {
+ inlier_set1[num_inliers * 2] = corners1[i * 2];
+ inlier_set1[num_inliers * 2 + 1] = corners1[i * 2 + 1];
+ inlier_set2[num_inliers * 2] = corners2[i * 2];
+ inlier_set2[num_inliers * 2 + 1] = corners2[i * 2 + 1];
+ num_inliers++;
+ sum_distance += distance;
+ sum_distance_squared += distance * distance;
+ }
+ }
+
+ if (num_inliers >= max_inliers && num_inliers > 1) {
+ int temp;
+ double fracinliers, pNoOutliers, mean_distance, variance;
+
+ assert(num_inliers > 1);
+ mean_distance = sum_distance / ((double)num_inliers);
+ variance = sum_distance_squared / ((double)num_inliers - 1.0) -
+ mean_distance * mean_distance * ((double)num_inliers) /
+ ((double)num_inliers - 1.0);
+ if ((num_inliers > max_inliers) ||
+ (num_inliers == max_inliers && variance < best_variance)) {
+ best_variance = variance;
+ max_inliers = num_inliers;
+ memcpy(best_params, params, paramdim * sizeof(*best_params));
+ memcpy(best_inlier_set1, inlier_set1,
+ num_inliers * 2 * sizeof(*best_inlier_set1));
+ memcpy(best_inlier_set2, inlier_set2,
+ num_inliers * 2 * sizeof(*best_inlier_set2));
+ memcpy(best_inlier_mask, inlier_mask,
+ npoints * sizeof(*best_inlier_mask));
+
+ assert(npoints > 0);
+ fracinliers = (double)num_inliers / (double)npoints;
+ pNoOutliers = 1 - pow(fracinliers, minpts);
+ pNoOutliers = fmax(EPS, pNoOutliers);
+ pNoOutliers = fmin(1 - EPS, pNoOutliers);
+ assert(fabs(1.0 - pNoOutliers) > 0.00001);
+ temp = (int)(log(1.0 - PROBABILITY_REQUIRED) / log(pNoOutliers));
+ if (temp > 0 && temp < N) {
+ N = AOMMAX(temp, MIN_TRIALS);
+ }
+ }
+ }
+ trial_count++;
+ }
+ find_transformation(max_inliers, best_inlier_set1, best_inlier_set2,
+ best_params);
+ if (normalize && denormalize) {
+ denormalize(best_params, T1, T2);
+ }
+ *number_of_inliers = max_inliers;
+finish_ransac:
+ aom_free(best_inlier_set1);
+ aom_free(best_inlier_set2);
+ aom_free(inlier_set1);
+ aom_free(inlier_set2);
+ aom_free(corners1);
+ aom_free(corners2);
+ aom_free(image1_coord);
+ aom_free(inlier_mask);
+ return ret_val;
+}
+
+static int is_collinear3(double *p1, double *p2, double *p3) {
+ static const double collinear_eps = 1e-3;
+ const double v =
+ (p2[0] - p1[0]) * (p3[1] - p1[1]) - (p2[1] - p1[1]) * (p3[0] - p1[0]);
+ return fabs(v) < collinear_eps;
+}
+
+static int is_degenerate_translation(double *p) {
+ return (p[0] - p[2]) * (p[0] - p[2]) + (p[1] - p[3]) * (p[1] - p[3]) <= 2;
+}
+
+static int is_degenerate_affine(double *p) {
+ return is_collinear3(p, p + 2, p + 4);
+}
+
+static int is_degenerate_homography(double *p) {
+ return is_collinear3(p, p + 2, p + 4) || is_collinear3(p, p + 2, p + 6) ||
+ is_collinear3(p, p + 4, p + 6) || is_collinear3(p + 2, p + 4, p + 6);
+}
+
+int ransac_translation(double *matched_points, int npoints,
+ int *number_of_inliers, int *best_inlier_mask,
+ double *best_params) {
+ return ransac(matched_points, npoints, number_of_inliers, best_inlier_mask,
+ best_params, 3, 2, is_degenerate_translation,
+ NULL, // normalize_homography,
+ NULL, // denormalize_rotzoom,
+ find_translation, project_points_double_translation);
+}
+
+int ransac_rotzoom(double *matched_points, int npoints, int *number_of_inliers,
+ int *best_inlier_mask, double *best_params) {
+ return ransac(matched_points, npoints, number_of_inliers, best_inlier_mask,
+ best_params, 3, 4, is_degenerate_affine,
+ NULL, // normalize_homography,
+ NULL, // denormalize_rotzoom,
+ find_rotzoom, project_points_double_rotzoom);
+}
+
+int ransac_affine(double *matched_points, int npoints, int *number_of_inliers,
+ int *best_inlier_mask, double *best_params) {
+ return ransac(matched_points, npoints, number_of_inliers, best_inlier_mask,
+ best_params, 3, 6, is_degenerate_affine,
+ NULL, // normalize_homography,
+ NULL, // denormalize_affine,
+ find_affine, project_points_double_affine);
+}
+
+int ransac_homography(double *matched_points, int npoints,
+ int *number_of_inliers, int *best_inlier_mask,
+ double *best_params) {
+ const int result =
+ ransac(matched_points, npoints, number_of_inliers, best_inlier_mask,
+ best_params, 4, 8, is_degenerate_homography,
+ NULL, // normalize_homography,
+ NULL, // denormalize_homography,
+ find_homography, project_points_double_homography);
+ if (!result) {
+ // normalize so that H33 = 1
+ int i;
+ const double m = 1.0 / best_params[8];
+ assert(fabs(best_params[8]) > 0.00001);
+ for (i = 0; i < 8; ++i) best_params[i] *= m;
+ best_params[8] = 1.0;
+ }
+ return result;
+}
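After every improved model, the loop above re-estimates how many more RANSAC trials are worthwhile: from the best inlier fraction it derives the probability that a random minimal sample contains an outlier, and shrinks the trial budget N so that the chance of never drawing an all-inlier sample drops below 1 - PROBABILITY_REQUIRED. A standalone sketch of that update (the function name and sample numbers are illustrative):

  #include <math.h>
  #include <stdio.h>

  /* Sketch of the adaptive trial-count update in ransac(): given the best
   * inlier fraction seen so far, how many random minimal samples keep the
   * probability of drawing at least one all-inlier sample at 90%. */
  static int ransac_trials_sketch(int num_inliers, int npoints, int minpts,
                                  int current_n) {
    const double kProbabilityRequired = 0.9;
    const double kEps = 1e-12;
    const double fracinliers = (double)num_inliers / (double)npoints;
    double p_no_outliers = 1.0 - pow(fracinliers, minpts);
    p_no_outliers = fmax(kEps, fmin(1.0 - kEps, p_no_outliers));
    const int temp =
        (int)(log(1.0 - kProbabilityRequired) / log(p_no_outliers));
    /* Only ever shrink the budget, and never below a minimum trial count. */
    if (temp > 0 && temp < current_n) return temp > 20 ? temp : 20;
    return current_n;
  }

  int main(void) {
    /* 50% inliers with 4-point (homography) samples: 35 trials suffice. */
    printf("%d\n", ransac_trials_sketch(50, 100, 4, 10000));
    return 0;
  }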
diff --git a/av1/encoder/ransac.h b/av1/encoder/ransac.h
new file mode 100644
index 0000000..f75d70f
--- /dev/null
+++ b/av1/encoder/ransac.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AV1_ENCODER_RANSAC_H_
+#define AV1_ENCODER_RANSAC_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <memory.h>
+
+#include "av1/common/warped_motion.h"
+
+typedef int (*RansacFunc)(double *matched_points, int npoints,
+ int *number_of_inliers, int *best_inlier_mask,
+ double *best_params);
+
+/* Each of these functions fits a motion model from a set of
+   corresponding points in two frames using RANSAC. */
+int ransac_homography(double *matched_points, int npoints,
+ int *number_of_inliers, int *best_inlier_indices,
+ double *best_params);
+int ransac_affine(double *matched_points, int npoints, int *number_of_inliers,
+ int *best_inlier_indices, double *best_params);
+int ransac_rotzoom(double *matched_points, int npoints, int *number_of_inliers,
+ int *best_inlier_indices, double *best_params);
+int ransac_translation(double *matched_points, int npoints,
+ int *number_of_inliers, int *best_inlier_indices,
+ double *best_params);
+#endif // AV1_ENCODER_RANSAC_H_
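Callers hand any of these fitters a flat array of correspondences, packed x1, y1, x2, y2 per matched point (the order in which ransac() above consumes them), plus buffers for the model parameters and the per-point inlier mask. A usage sketch for the rotation/zoom fitter follows; the synthetic data is made up, and the snippet only builds when linked into the encoder.

  #include <stdio.h>

  #include "av1/encoder/ransac.h"

  /* Usage sketch: fit a rotation/zoom model to synthetic correspondences.
   * ransac() wants at least MINPTS_MULTIPLIER (5) times the minimal sample
   * size, i.e. 15 matches for the 3-point rotzoom fitter, so 20 are
   * generated here under an exact zoom of 1.1 plus a shift. */
  int fit_rotzoom_sketch(void) {
    double matched_points[20 * 4];
    double params[4]; /* the rotzoom model has 4 parameters */
    int inlier_mask[20];
    int num_inliers = 0;
    int i;
    for (i = 0; i < 20; ++i) {
      const double x = (double)(i % 5), y = (double)(i / 5);
      matched_points[i * 4 + 0] = x;
      matched_points[i * 4 + 1] = y;
      matched_points[i * 4 + 2] = 1.1 * x + 2.0;
      matched_points[i * 4 + 3] = 1.1 * y - 1.0;
    }
    if (ransac_rotzoom(matched_points, 20, &num_inliers, inlier_mask, params))
      return -1;
    printf("%d inliers\n", num_inliers); /* expect all 20 to be inliers */
    return 0;
  }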
diff --git a/av1/encoder/ratectrl.c b/av1/encoder/ratectrl.c
index e05dff9..2141f30 100644
--- a/av1/encoder/ratectrl.c
+++ b/av1/encoder/ratectrl.c
@@ -39,13 +39,10 @@
#define DEFAULT_KF_BOOST 2000
#define DEFAULT_GF_BOOST 2000
-#define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1
-
#define MIN_BPB_FACTOR 0.005
#define MAX_BPB_FACTOR 50
#define FRAME_OVERHEAD_BITS 200
-
#if CONFIG_AOM_HIGHBITDEPTH
#define ASSIGN_MINQ_TABLE(bit_depth, name) \
do { \
@@ -191,14 +188,21 @@
const AV1EncoderConfig *oxcf = &cpi->oxcf;
const int min_frame_target =
AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
- if (target < min_frame_target) target = min_frame_target;
+// Clip the frame target to the minimum setup value.
+#if CONFIG_EXT_REFS
+ if (cpi->rc.is_src_frame_alt_ref) {
+#else
if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) {
+#endif // CONFIG_EXT_REFS
// If there is an active ARF at this location use the minimum
// bits on this frame even if it is a constructed arf.
// The active maximum quantizer insures that an appropriate
// number of bits will be spent if needed for constructed ARFs.
target = min_frame_target;
+ } else if (target < min_frame_target) {
+ target = min_frame_target;
}
+
// Clip the frame target to the maximum allowed value.
if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
if (oxcf->rc_max_inter_bitrate_pct) {
@@ -206,6 +210,7 @@
rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
target = AOMMIN(target, max_rate);
}
+
return target;
}
@@ -710,7 +715,6 @@
*top_index = active_worst_quality;
*bottom_index = active_best_quality;
-#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
// Limit Q range for the adaptive loop.
if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
!(cm->current_video_frame == 0)) {
@@ -719,9 +723,8 @@
qdelta = av1_compute_qdelta_by_rate(
&cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth);
*top_index = active_worst_quality + qdelta;
- *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
+ *top_index = AOMMAX(*top_index, *bottom_index);
}
-#endif
// Special case code to try and match quality with forced key frames
if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) {
@@ -737,6 +740,7 @@
q = *top_index;
}
}
+
assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
assert(*bottom_index <= rc->worst_quality &&
*bottom_index >= rc->best_quality);
@@ -862,12 +866,10 @@
*top_index = active_worst_quality;
*bottom_index = active_best_quality;
-#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
+ // Limit Q range for the adaptive loop.
{
int qdelta = 0;
aom_clear_system_state();
-
- // Limit Q range for the adaptive loop.
if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
!(cm->current_video_frame == 0)) {
qdelta = av1_compute_qdelta_by_rate(
@@ -878,9 +880,8 @@
&cpi->rc, cm->frame_type, active_worst_quality, 1.75, cm->bit_depth);
}
*top_index = active_worst_quality + qdelta;
- *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
+ *top_index = AOMMAX(*top_index, *bottom_index);
}
-#endif
if (oxcf->rc_mode == AOM_Q) {
q = active_best_quality;
@@ -911,30 +912,22 @@
1.00, // INTER_NORMAL
#if CONFIG_EXT_REFS
0.80, // INTER_LOW
- 1.25, // INTER_HIGH
+ 1.50, // INTER_HIGH
+ 1.25, // GF_ARF_LOW
#else
1.00, // INTER_HIGH
-#endif // CONFIG_EXT_REFS
1.50, // GF_ARF_LOW
- 1.75, // GF_ARF_STD
+#endif // CONFIG_EXT_REFS
+ 2.00, // GF_ARF_STD
2.00, // KF_STD
};
- static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] = {
+ static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] =
#if CONFIG_EXT_REFS
- INTER_FRAME,
- INTER_FRAME,
- INTER_FRAME,
- INTER_FRAME,
- INTER_FRAME,
- KEY_FRAME
+ { INTER_FRAME, INTER_FRAME, INTER_FRAME,
+ INTER_FRAME, INTER_FRAME, KEY_FRAME };
#else
- INTER_FRAME,
- INTER_FRAME,
- INTER_FRAME,
- INTER_FRAME,
- KEY_FRAME
+ { INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME };
#endif // CONFIG_EXT_REFS
- };
const AV1_COMMON *const cm = &cpi->common;
int qdelta =
av1_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q,
@@ -984,6 +977,7 @@
// Not forced keyframe.
double q_adj_factor = 1.0;
double q_val;
+
// Baseline value derived from cpi->active_worst_quality and kf boost.
active_best_quality =
get_kf_active_quality(rc, active_worst_quality, cm->bit_depth);
@@ -1067,7 +1061,6 @@
}
}
-#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
aom_clear_system_state();
// Static forced key frames Q restrictions dealt with elsewhere.
if (!(frame_is_intra_only(cm)) || !rc->this_key_frame_forced ||
@@ -1077,7 +1070,6 @@
active_worst_quality =
AOMMAX(active_worst_quality + qdelta, active_best_quality);
}
-#endif
// Modify active_best_quality for downscaled normal frames.
if (rc->frame_size_selector != UNSCALED && !frame_is_kf_gf_arf(cpi)) {
@@ -1188,10 +1180,23 @@
static void update_golden_frame_stats(AV1_COMP *cpi) {
RATE_CONTROL *const rc = &cpi->rc;
+#if CONFIG_EXT_REFS
+ // Update the Golden frame usage counts.
+ // NOTE(weitinglin): If we use show_existing_frame for an OVERLAY frame,
+ // only the virtual indices for the reference frame will be
+ // updated and cpi->refresh_golden_frame will still be zero.
+ if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) {
+#else
// Update the Golden frame usage counts.
if (cpi->refresh_golden_frame) {
- // this frame refreshes means next frames don't unless specified by user
- rc->frames_since_golden = 0;
+#endif // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+ // We will not use internal overlay frames to replace the golden frame
+ if (!rc->is_src_frame_ext_arf)
+#endif // CONFIG_EXT_REFS
+ // This frame refreshes the golden frame, so the next frames don't unless specified by the user
+ rc->frames_since_golden = 0;
// If we are not using alt ref in the up and coming group clear the arf
// active flag. In multi arf group case, if the index is not 0 then
@@ -1251,7 +1256,7 @@
}
}
- // Keep record of last boosted (KF/KF/ARF) Q value.
+ // Keep record of last boosted (KF/GF/ARF) Q value.
// If the current frame is coded at a lower Q then we also update it.
// If all mbs in this group are skipped only update if the Q value is
// better than that already stored.
@@ -1299,6 +1304,7 @@
update_golden_frame_stats(cpi);
if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0;
+
#if CONFIG_EXT_REFS
if (cm->show_frame || rc->is_bwd_ref_frame) {
#else
@@ -1618,7 +1624,7 @@
// How far through the clip are we.
// This number is used to damp the per frame rate correction.
// Range 0 - 1.0
- if (cpi->twopass.total_stats.count) {
+ if (cpi->twopass.total_stats.count != 0.) {
position_factor = sqrt((double)cpi->common.current_video_frame /
cpi->twopass.total_stats.count);
}
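The reworked target clipping above (overlay frames that share a source with an alt-ref are forced to the minimum budget; everything else is clamped between the minimum target and the frame-size and bitrate caps) can be summarised in a standalone sketch. The function name and the sample numbers are illustrative.

  #include <stdio.h>

  #define MAXI(a, b) ((a) > (b) ? (a) : (b))
  #define MINI(a, b) ((a) < (b) ? (a) : (b))

  /* Sketch of the inter-frame target clipping: all sizes are in bits. */
  static int clamp_frame_target_sketch(int target, int is_src_frame_alt_ref,
                                       int min_frame_bandwidth,
                                       int avg_frame_bandwidth,
                                       int max_frame_bandwidth,
                                       int max_inter_bitrate_pct) {
    const int min_frame_target =
        MAXI(min_frame_bandwidth, avg_frame_bandwidth >> 5);
    if (is_src_frame_alt_ref)
      target = min_frame_target; /* the active ARF already carries the bits */
    else if (target < min_frame_target)
      target = min_frame_target;
    if (target > max_frame_bandwidth) target = max_frame_bandwidth;
    if (max_inter_bitrate_pct) {
      const int max_rate = avg_frame_bandwidth * max_inter_bitrate_pct / 100;
      target = MINI(target, max_rate);
    }
    return target;
  }

  int main(void) {
    /* A 300 kbit request on a stream averaging 80 kbit/frame with a 200%
     * inter-frame cap is limited to 160 kbit. */
    printf("%d\n", clamp_frame_target_sketch(300000, 0, 0, 80000, 500000, 200));
    return 0;
  }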
diff --git a/av1/encoder/ratectrl.h b/av1/encoder/ratectrl.h
index 313c700..93a9b49 100644
--- a/av1/encoder/ratectrl.h
+++ b/av1/encoder/ratectrl.h
@@ -34,9 +34,9 @@
INTER_LOW = 1,
INTER_HIGH = 2,
GF_ARF_LOW = 3,
- GF_ARF_STD = 5,
- KF_STD = 6,
- RATE_FACTOR_LEVELS = 7
+ GF_ARF_STD = 4,
+ KF_STD = 5,
+ RATE_FACTOR_LEVELS = 6
} RATE_FACTOR_LEVEL;
#else
typedef enum {
@@ -111,6 +111,7 @@
int is_bwd_ref_frame;
int is_last_bipred_frame;
int is_bipred_frame;
+ int is_src_frame_ext_arf;
#endif // CONFIG_EXT_REFS
int avg_frame_bandwidth; // Average frame size target for clip
diff --git a/av1/encoder/rd.c b/av1/encoder/rd.c
index f2698b2..fe6c720 100644
--- a/av1/encoder/rd.c
+++ b/av1/encoder/rd.c
@@ -63,7 +63,10 @@
// This table is used to correct for block size.
// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = {
- 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32
+ 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32,
+#if CONFIG_EXT_PARTITION
+ 48, 48, 64
+#endif // CONFIG_EXT_PARTITION
};
static void fill_mode_costs(AV1_COMP *cpi) {
@@ -75,7 +78,10 @@
av1_cost_tokens(cpi->y_mode_costs[i][j], av1_kf_y_mode_prob[i][j],
av1_intra_mode_tree);
- av1_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], av1_intra_mode_tree);
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+ av1_cost_tokens(cpi->mbmode_cost[i], fc->y_mode_prob[i],
+ av1_intra_mode_tree);
+
for (i = 0; i < INTRA_MODES; ++i)
av1_cost_tokens(cpi->intra_uv_mode_cost[i], fc->uv_mode_prob[i],
av1_intra_mode_tree);
@@ -104,6 +110,30 @@
}
#endif // CONFIG_PALETTE
+ for (i = 0; i < MAX_TX_DEPTH; ++i)
+ for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+ av1_cost_tokens(cpi->tx_size_cost[i][j], fc->tx_size_probs[i][j],
+ av1_tx_size_tree[i]);
+
+#if CONFIG_EXT_TX
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ int s;
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+ if (use_inter_ext_tx_for_txsize[s][i]) {
+ av1_cost_tokens(cpi->inter_tx_type_costs[s][i],
+ fc->inter_ext_tx_prob[s][i], av1_ext_tx_inter_tree[s]);
+ }
+ }
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+ if (use_intra_ext_tx_for_txsize[s][i]) {
+ for (j = 0; j < INTRA_MODES; ++j)
+ av1_cost_tokens(cpi->intra_tx_type_costs[s][i][j],
+ fc->intra_ext_tx_prob[s][i][j],
+ av1_ext_tx_intra_tree[s]);
+ }
+ }
+ }
+#else
for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
for (j = 0; j < TX_TYPES; ++j)
av1_cost_tokens(cpi->intra_tx_type_costs[i][j],
@@ -113,10 +143,20 @@
av1_cost_tokens(cpi->inter_tx_type_costs[i], fc->inter_ext_tx_prob[i],
av1_ext_tx_tree);
}
+#endif // CONFIG_EXT_TX
+#if CONFIG_EXT_INTRA
+ for (i = 0; i < INTRA_FILTERS + 1; ++i)
+ av1_cost_tokens(cpi->intra_filter_cost[i], fc->intra_filter_probs[i],
+ av1_intra_filter_tree);
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_LOOP_RESTORATION
+ av1_cost_tokens(cpi->switchable_restore_cost, fc->switchable_restore_prob,
+ av1_switchable_restore_tree);
+#endif // CONFIG_LOOP_RESTORATION
}
-static void fill_token_costs(av1_coeff_cost *c,
- av1_coeff_probs_model (*p)[PLANE_TYPES]) {
+void av1_fill_token_costs(av1_coeff_cost *c,
+ av1_coeff_probs_model (*p)[PLANE_TYPES]) {
int i, j, k, l;
TX_SIZE t;
for (t = TX_4X4; t <= TX_32X32; ++t)
@@ -176,6 +216,10 @@
#if CONFIG_EXT_REFS
// TODO(zoeliu): To adjust further following factor values.
128, 128, 128
+ // TODO(weitinglin): We should investigate if the values should be the same
+ // as the value used by OVERLAY frame
+ ,
+ 144
#endif // CONFIG_EXT_REFS
};
@@ -312,20 +356,10 @@
set_block_thresholds(cm, rd);
- fill_token_costs(x->token_costs, cm->fc->coef_probs);
-
- if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
- cm->frame_type == KEY_FRAME) {
- for (i = 0; i < PARTITION_CONTEXTS; ++i)
- av1_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
- av1_partition_tree);
- }
-
- fill_mode_costs(cpi);
-
if (!frame_is_intra_only(cm)) {
#if CONFIG_REF_MV
int nmv_ctx;
+
for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) {
av1_build_nmv_cost_table(
x->nmv_vec_cost[nmv_ctx],
@@ -343,36 +377,74 @@
cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, &cm->fc->nmvc,
cm->allow_high_precision_mv);
#endif
+ }
+ if (cpi->oxcf.pass != 1) {
+ av1_fill_token_costs(x->token_costs, cm->fc->coef_probs);
-#if CONFIG_REF_MV
- for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) {
- cpi->newmv_mode_cost[i][0] = av1_cost_bit(cm->fc->newmv_prob[i], 0);
- cpi->newmv_mode_cost[i][1] = av1_cost_bit(cm->fc->newmv_prob[i], 1);
- }
-
- for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i) {
- cpi->zeromv_mode_cost[i][0] = av1_cost_bit(cm->fc->zeromv_prob[i], 0);
- cpi->zeromv_mode_cost[i][1] = av1_cost_bit(cm->fc->zeromv_prob[i], 1);
- }
-
- for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) {
- cpi->refmv_mode_cost[i][0] = av1_cost_bit(cm->fc->refmv_prob[i], 0);
- cpi->refmv_mode_cost[i][1] = av1_cost_bit(cm->fc->refmv_prob[i], 1);
- }
- for (i = 0; i < DRL_MODE_CONTEXTS; ++i) {
- cpi->drl_mode_cost[i][0] = av1_cost_bit(cm->fc->drl_prob[i], 0);
- cpi->drl_mode_cost[i][1] = av1_cost_bit(cm->fc->drl_prob[i], 1);
- }
+ if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
+ cm->frame_type == KEY_FRAME) {
+#if CONFIG_EXT_PARTITION_TYPES
+ av1_cost_tokens(cpi->partition_cost[0], cm->fc->partition_prob[0],
+ av1_partition_tree);
+ for (i = 1; i < PARTITION_CONTEXTS; ++i)
+ av1_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
+ av1_ext_partition_tree);
#else
- for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
- av1_cost_tokens((int *)cpi->inter_mode_cost[i],
- cm->fc->inter_mode_probs[i], av1_inter_mode_tree);
-#endif
-#if CONFIG_MOTION_VAR
- for (i = 0; i < BLOCK_SIZES; i++)
- av1_cost_tokens((int *)cpi->motion_mode_cost[i],
- cm->fc->motion_mode_prob[i], av1_motion_mode_tree);
-#endif // CONFIG_MOTION_VAR
+ for (i = 0; i < PARTITION_CONTEXTS; ++i)
+ av1_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
+ av1_partition_tree);
+#endif // CONFIG_EXT_PARTITION_TYPES
+ }
+
+ fill_mode_costs(cpi);
+
+ if (!frame_is_intra_only(cm)) {
+#if CONFIG_REF_MV
+ for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) {
+ cpi->newmv_mode_cost[i][0] = av1_cost_bit(cm->fc->newmv_prob[i], 0);
+ cpi->newmv_mode_cost[i][1] = av1_cost_bit(cm->fc->newmv_prob[i], 1);
+ }
+
+ for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i) {
+ cpi->zeromv_mode_cost[i][0] = av1_cost_bit(cm->fc->zeromv_prob[i], 0);
+ cpi->zeromv_mode_cost[i][1] = av1_cost_bit(cm->fc->zeromv_prob[i], 1);
+ }
+
+ for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) {
+ cpi->refmv_mode_cost[i][0] = av1_cost_bit(cm->fc->refmv_prob[i], 0);
+ cpi->refmv_mode_cost[i][1] = av1_cost_bit(cm->fc->refmv_prob[i], 1);
+ }
+
+ for (i = 0; i < DRL_MODE_CONTEXTS; ++i) {
+ cpi->drl_mode_cost0[i][0] = av1_cost_bit(cm->fc->drl_prob[i], 0);
+ cpi->drl_mode_cost0[i][1] = av1_cost_bit(cm->fc->drl_prob[i], 1);
+ }
+#if CONFIG_EXT_INTER
+ cpi->new2mv_mode_cost[0] = av1_cost_bit(cm->fc->new2mv_prob, 0);
+ cpi->new2mv_mode_cost[1] = av1_cost_bit(cm->fc->new2mv_prob, 1);
+#endif // CONFIG_EXT_INTER
+#else
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ av1_cost_tokens((int *)cpi->inter_mode_cost[i],
+ cm->fc->inter_mode_probs[i], av1_inter_mode_tree);
+#endif // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ av1_cost_tokens((int *)cpi->inter_compound_mode_cost[i],
+ cm->fc->inter_compound_mode_probs[i],
+ av1_inter_compound_mode_tree);
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+ av1_cost_tokens((int *)cpi->interintra_mode_cost[i],
+ cm->fc->interintra_mode_prob[i],
+ av1_interintra_mode_tree);
+#endif // CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ for (i = BLOCK_8X8; i < BLOCK_SIZES; i++) {
+ av1_cost_tokens((int *)cpi->motion_mode_cost[i],
+ cm->fc->motion_mode_prob[i], av1_motion_mode_tree);
+ }
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ }
}
}
@@ -442,7 +514,7 @@
*d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
}
-void av1_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
+void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2,
unsigned int qstep, int *rate,
int64_t *dist) {
// This function models the rate and distortion for a Laplacian
@@ -466,11 +538,10 @@
}
}
-void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
- const struct macroblockd_plane *pd,
- ENTROPY_CONTEXT t_above[16],
- ENTROPY_CONTEXT t_left[16]) {
- const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+static void get_entropy_contexts_plane(
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) {
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const ENTROPY_CONTEXT *const above = pd->above_context;
@@ -500,10 +571,52 @@
for (i = 0; i < num_4x4_h; i += 8)
t_left[i] = !!*(const uint64_t *)&left[i];
break;
+ case TX_4X8:
+ memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+ for (i = 0; i < num_4x4_h; i += 2)
+ t_left[i] = !!*(const uint16_t *)&left[i];
+ break;
+ case TX_8X4:
+ for (i = 0; i < num_4x4_w; i += 2)
+ t_above[i] = !!*(const uint16_t *)&above[i];
+ memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
+ break;
+ case TX_8X16:
+ for (i = 0; i < num_4x4_w; i += 2)
+ t_above[i] = !!*(const uint16_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 4)
+ t_left[i] = !!*(const uint32_t *)&left[i];
+ break;
+ case TX_16X8:
+ for (i = 0; i < num_4x4_w; i += 4)
+ t_above[i] = !!*(const uint32_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 2)
+ t_left[i] = !!*(const uint16_t *)&left[i];
+ break;
+ case TX_16X32:
+ for (i = 0; i < num_4x4_w; i += 4)
+ t_above[i] = !!*(const uint32_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 8)
+ t_left[i] = !!*(const uint64_t *)&left[i];
+ break;
+ case TX_32X16:
+ for (i = 0; i < num_4x4_w; i += 8)
+ t_above[i] = !!*(const uint64_t *)&above[i];
+ for (i = 0; i < num_4x4_h; i += 4)
+ t_left[i] = !!*(const uint32_t *)&left[i];
+ break;
default: assert(0 && "Invalid transform size."); break;
}
}
+void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) {
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ get_entropy_contexts_plane(plane_bsize, tx_size, pd, t_above, t_left);
+}
+
void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
int i;
@@ -571,9 +684,11 @@
dst[1].stride = dst[2].stride = src->uv_stride;
for (i = 0; i < MAX_MB_PLANE; ++i) {
- setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col,
- i ? scale_uv : scale, xd->plane[i].subsampling_x,
- xd->plane[i].subsampling_y);
+ setup_pred_plane(dst + i, dst[i].buf,
+ i ? src->uv_crop_width : src->y_crop_width,
+ i ? src->uv_crop_height : src->y_crop_height,
+ dst[i].stride, mi_row, mi_col, i ? scale_uv : scale,
+ xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
}
}
@@ -601,11 +716,34 @@
: NULL;
}
+#if CONFIG_DUAL_FILTER
+int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ if (cm->interp_filter == SWITCHABLE) {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int inter_filter_cost = 0;
+ int dir;
+
+ for (dir = 0; dir < 2; ++dir) {
+ if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
+ (mbmi->ref_frame[1] > INTRA_FRAME &&
+ has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ inter_filter_cost +=
+ cpi->switchable_interp_costs[ctx][mbmi->interp_filter[dir]];
+ }
+ }
+ return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
+ } else {
+ return 0;
+ }
+}
+#else
int av1_get_switchable_rate(const AV1_COMP *cpi, const MACROBLOCKD *xd) {
const AV1_COMMON *const cm = &cpi->common;
if (cm->interp_filter == SWITCHABLE) {
#if CONFIG_EXT_INTERP
- if (is_interp_needed(xd))
+ if (av1_is_interp_needed(xd))
#endif
{
const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
@@ -616,6 +754,7 @@
}
return 0;
}
+#endif
void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
int i;
@@ -633,8 +772,8 @@
rd->thresh_mult[THR_NEARESTL3] = 300;
rd->thresh_mult[THR_NEARESTB] = 300;
#endif // CONFIG_EXT_REFS
- rd->thresh_mult[THR_NEARESTG] = 300;
rd->thresh_mult[THR_NEARESTA] = 300;
+ rd->thresh_mult[THR_NEARESTG] = 300;
} else {
rd->thresh_mult[THR_NEARESTMV] = 0;
#if CONFIG_EXT_REFS
@@ -642,8 +781,8 @@
rd->thresh_mult[THR_NEARESTL3] = 0;
rd->thresh_mult[THR_NEARESTB] = 0;
#endif // CONFIG_EXT_REFS
- rd->thresh_mult[THR_NEARESTG] = 0;
rd->thresh_mult[THR_NEARESTA] = 0;
+ rd->thresh_mult[THR_NEARESTG] = 0;
}
rd->thresh_mult[THR_DC] += 1000;
@@ -666,6 +805,17 @@
rd->thresh_mult[THR_NEARA] += 1000;
rd->thresh_mult[THR_NEARG] += 1000;
+#if CONFIG_EXT_INTER
+ rd->thresh_mult[THR_NEWFROMNEARMV] += 1000;
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_NEWFROMNEARL2] += 1000;
+ rd->thresh_mult[THR_NEWFROMNEARL3] += 1000;
+ rd->thresh_mult[THR_NEWFROMNEARB] += 1000;
+#endif // CONFIG_EXT_REFS
+ rd->thresh_mult[THR_NEWFROMNEARA] += 1000;
+ rd->thresh_mult[THR_NEWFROMNEARG] += 1000;
+#endif // CONFIG_EXT_INTER
+
rd->thresh_mult[THR_ZEROMV] += 2000;
#if CONFIG_EXT_REFS
rd->thresh_mult[THR_ZEROL2] += 2000;
@@ -677,6 +827,23 @@
rd->thresh_mult[THR_TM] += 1000;
+#if CONFIG_EXT_INTER
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000;
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000;
+#endif // CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 1000;
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] += 1000;
+#endif // CONFIG_EXT_REFS
+
+#else // CONFIG_EXT_INTER
+
rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
#if CONFIG_EXT_REFS
rd->thresh_mult[THR_COMP_NEARESTL2A] += 1000;
@@ -690,6 +857,96 @@
rd->thresh_mult[THR_COMP_NEARESTGB] += 1000;
#endif // CONFIG_EXT_REFS
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_EXT_INTER
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARLA] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARESTLA] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARLA] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLA] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLA] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2000;
+ rd->thresh_mult[THR_COMP_ZERO_ZEROLA] += 2500;
+
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_NEAREST_NEARL2A] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARESTL2A] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2A] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2A] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 2000;
+ rd->thresh_mult[THR_COMP_ZERO_ZEROL2A] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARL3A] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARESTL3A] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3A] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3A] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3A] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3A] += 2000;
+ rd->thresh_mult[THR_COMP_ZERO_ZEROL3A] += 2500;
+#endif // CONFIG_EXT_REFS
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARGA] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARESTGA] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGA] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGA] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARGA] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGA] += 2000;
+ rd->thresh_mult[THR_COMP_ZERO_ZEROGA] += 2500;
+
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_NEAREST_NEARLB] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARESTLB] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARLB] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLB] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLB] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARLB] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2000;
+ rd->thresh_mult[THR_COMP_ZERO_ZEROLB] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARL2B] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARESTL2B] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2B] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2B] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2B] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2B] += 2000;
+ rd->thresh_mult[THR_COMP_ZERO_ZEROL2B] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARL3B] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARESTL3B] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3B] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3B] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3B] += 2000;
+ rd->thresh_mult[THR_COMP_ZERO_ZEROL3B] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARGB] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARESTGB] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARGB] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGB] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGB] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGB] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARGB] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGB] += 2000;
+ rd->thresh_mult[THR_COMP_ZERO_ZEROGB] += 2500;
+#endif // CONFIG_EXT_REFS
+
+#else // CONFIG_EXT_INTER
+
rd->thresh_mult[THR_COMP_NEARLA] += 1500;
rd->thresh_mult[THR_COMP_NEWLA] += 2000;
#if CONFIG_EXT_REFS
@@ -726,14 +983,52 @@
rd->thresh_mult[THR_COMP_ZEROGB] += 2500;
#endif // CONFIG_EXT_REFS
+#endif // CONFIG_EXT_INTER
+
rd->thresh_mult[THR_H_PRED] += 2000;
rd->thresh_mult[THR_V_PRED] += 2000;
- rd->thresh_mult[THR_D45_PRED] += 2500;
rd->thresh_mult[THR_D135_PRED] += 2500;
- rd->thresh_mult[THR_D117_PRED] += 2500;
- rd->thresh_mult[THR_D153_PRED] += 2500;
rd->thresh_mult[THR_D207_PRED] += 2500;
+ rd->thresh_mult[THR_D153_PRED] += 2500;
rd->thresh_mult[THR_D63_PRED] += 2500;
+ rd->thresh_mult[THR_D117_PRED] += 2500;
+ rd->thresh_mult[THR_D45_PRED] += 2500;
+
+#if CONFIG_EXT_INTER
+ rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARL] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEWL] += 2000;
+
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL2] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL2] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARL2] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEWL2] += 2000;
+
+ rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL3] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL3] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARL3] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEWL3] += 2000;
+#endif // CONFIG_EXT_REFS
+
+ rd->thresh_mult[THR_COMP_INTERINTRA_ZEROG] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARG] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEWG] += 2000;
+
+#if CONFIG_EXT_REFS
+ rd->thresh_mult[THR_COMP_INTERINTRA_ZEROB] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTB] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARB] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEWB] += 2000;
+#endif // CONFIG_EXT_REFS
+
+ rd->thresh_mult[THR_COMP_INTERINTRA_ZEROA] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEARA] += 1500;
+ rd->thresh_mult[THR_COMP_INTERINTRA_NEWA] += 2000;
+#endif // CONFIG_EXT_INTER
}
void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) {
@@ -753,14 +1048,15 @@
memcpy(rd->thresh_mult_sub8x8, thresh_mult[idx], sizeof(thresh_mult[idx]));
}
-void av1_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
+void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
+ int (*factor_buf)[MAX_MODES], int rd_thresh,
int bsize, int best_mode_index) {
if (rd_thresh > 0) {
const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
int mode;
for (mode = 0; mode < top_mode; ++mode) {
const BLOCK_SIZE min_size = AOMMAX(bsize - 1, BLOCK_4X4);
- const BLOCK_SIZE max_size = AOMMIN(bsize + 2, BLOCK_64X64);
+ const BLOCK_SIZE max_size = AOMMIN(bsize + 2, (int)cm->sb_size);
BLOCK_SIZE bs;
for (bs = min_size; bs <= max_size; ++bs) {
int *const fact = &factor_buf[bs][mode];
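Every per-transform-size case added to get_entropy_contexts_plane() above does the same thing: collapse the per-4x4 ENTROPY_CONTEXT bytes spanned by one transform block into a single nonzero flag, using a 2-, 4- or 8-byte load plus !! instead of a loop. A standalone sketch of the 4-wide (16-pixel) case follows, assuming, as the encoder does, that each context entry is one byte; the names are stand-ins.

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  typedef char entropy_context_sketch; /* stand-in for ENTROPY_CONTEXT */

  /* Collapse each group of 4 per-4x4 context bytes (one 16-pixel-wide
   * transform column) into a 0/1 flag, the way the TX_16X16 / TX_16X32
   * cases above do with a uint32_t load. */
  static void collapse_ctx_16_sketch(const entropy_context_sketch *above,
                                     entropy_context_sketch *t_above,
                                     int n4x4) {
    int i;
    for (i = 0; i < n4x4; i += 4) {
      uint32_t packed;
      memcpy(&packed, &above[i], sizeof(packed)); /* portable unaligned read */
      t_above[i] = !!packed;
    }
  }

  int main(void) {
    const entropy_context_sketch above[8] = { 0, 0, 1, 0, 0, 0, 0, 0 };
    entropy_context_sketch t_above[8] = { 0 };
    collapse_ctx_16_sketch(above, t_above, 8);
    printf("%d %d\n", t_above[0], t_above[4]); /* prints "1 0" */
    return 0;
  }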
diff --git a/av1/encoder/rd.h b/av1/encoder/rd.h
index 02f6b4f..c9d21a8 100644
--- a/av1/encoder/rd.h
+++ b/av1/encoder/rd.h
@@ -14,6 +14,9 @@
#include <limits.h>
+#if CONFIG_ANS
+#include "aom_dsp/ans.h"
+#endif // CONFIG_ANS
#include "av1/common/blockd.h"
#include "av1/encoder/block.h"
@@ -29,6 +32,11 @@
#define RDCOST(RM, DM, R, D) \
(ROUND_POWER_OF_TWO(((int64_t)R) * (RM), AV1_PROB_COST_SHIFT) + (D << DM))
+
+#define RDCOST_DBL(RM, DM, R, D) \
+ (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \
+ ((double)(D) * (1 << (DM))))
+
#define QIDX_SKIP_THRESH 115
#define MV_COST_WEIGHT 108
@@ -37,10 +45,26 @@
#define INVALID_MV 0x80008000
#if CONFIG_EXT_REFS
+
+#if CONFIG_EXT_INTER
+#define MAX_MODES 144
+#else // CONFIG_EXT_INTER
#define MAX_MODES 66
+#endif // CONFIG_EXT_INTER
+
+#else // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_INTER
+#define MAX_MODES 57
+#else // CONFIG_EXT_INTER
+#define MAX_MODES 30
+#endif // CONFIG_EXT_INTER
+
+#endif // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
#define MAX_REFS 15
#else
-#define MAX_MODES 30
#define MAX_REFS 6
#endif // CONFIG_EXT_REFS
@@ -79,6 +103,17 @@
THR_NEARA,
THR_NEARG,
+#if CONFIG_EXT_INTER
+ THR_NEWFROMNEARMV,
+#if CONFIG_EXT_REFS
+ THR_NEWFROMNEARL2,
+ THR_NEWFROMNEARL3,
+ THR_NEWFROMNEARB,
+#endif // CONFIG_EXT_REFS
+ THR_NEWFROMNEARA,
+ THR_NEWFROMNEARG,
+#endif // CONFIG_EXT_INTER
+
THR_ZEROMV,
#if CONFIG_EXT_REFS
THR_ZEROL2,
@@ -88,6 +123,23 @@
THR_ZEROG,
THR_ZEROA,
+#if CONFIG_EXT_INTER
+
+ THR_COMP_NEAREST_NEARESTLA,
+#if CONFIG_EXT_REFS
+ THR_COMP_NEAREST_NEARESTL2A,
+ THR_COMP_NEAREST_NEARESTL3A,
+#endif // CONFIG_EXT_REFS
+ THR_COMP_NEAREST_NEARESTGA,
+#if CONFIG_EXT_REFS
+ THR_COMP_NEAREST_NEARESTLB,
+ THR_COMP_NEAREST_NEARESTL2B,
+ THR_COMP_NEAREST_NEARESTL3B,
+ THR_COMP_NEAREST_NEARESTGB,
+#endif // CONFIG_EXT_REFS
+
+#else // CONFIG_EXT_INTER
+
THR_COMP_NEARESTLA,
#if CONFIG_EXT_REFS
THR_COMP_NEARESTL2A,
@@ -101,8 +153,98 @@
THR_COMP_NEARESTGB,
#endif // CONFIG_EXT_REFS
+#endif // CONFIG_EXT_INTER
+
THR_TM,
+#if CONFIG_EXT_INTER
+
+ THR_COMP_NEAR_NEARESTLA,
+ THR_COMP_NEAREST_NEARLA,
+ THR_COMP_NEAR_NEARLA,
+ THR_COMP_NEW_NEARESTLA,
+ THR_COMP_NEAREST_NEWLA,
+ THR_COMP_NEW_NEARLA,
+ THR_COMP_NEAR_NEWLA,
+ THR_COMP_NEW_NEWLA,
+ THR_COMP_ZERO_ZEROLA,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_NEAR_NEARESTL2A,
+ THR_COMP_NEAREST_NEARL2A,
+ THR_COMP_NEAR_NEARL2A,
+ THR_COMP_NEW_NEARESTL2A,
+ THR_COMP_NEAREST_NEWL2A,
+ THR_COMP_NEW_NEARL2A,
+ THR_COMP_NEAR_NEWL2A,
+ THR_COMP_NEW_NEWL2A,
+ THR_COMP_ZERO_ZEROL2A,
+
+ THR_COMP_NEAR_NEARESTL3A,
+ THR_COMP_NEAREST_NEARL3A,
+ THR_COMP_NEAR_NEARL3A,
+ THR_COMP_NEW_NEARESTL3A,
+ THR_COMP_NEAREST_NEWL3A,
+ THR_COMP_NEW_NEARL3A,
+ THR_COMP_NEAR_NEWL3A,
+ THR_COMP_NEW_NEWL3A,
+ THR_COMP_ZERO_ZEROL3A,
+#endif // CONFIG_EXT_REFS
+
+ THR_COMP_NEAR_NEARESTGA,
+ THR_COMP_NEAREST_NEARGA,
+ THR_COMP_NEAR_NEARGA,
+ THR_COMP_NEW_NEARESTGA,
+ THR_COMP_NEAREST_NEWGA,
+ THR_COMP_NEW_NEARGA,
+ THR_COMP_NEAR_NEWGA,
+ THR_COMP_NEW_NEWGA,
+ THR_COMP_ZERO_ZEROGA,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_NEAR_NEARESTLB,
+ THR_COMP_NEAREST_NEARLB,
+ THR_COMP_NEAR_NEARLB,
+ THR_COMP_NEW_NEARESTLB,
+ THR_COMP_NEAREST_NEWLB,
+ THR_COMP_NEW_NEARLB,
+ THR_COMP_NEAR_NEWLB,
+ THR_COMP_NEW_NEWLB,
+ THR_COMP_ZERO_ZEROLB,
+
+ THR_COMP_NEAR_NEARESTL2B,
+ THR_COMP_NEAREST_NEARL2B,
+ THR_COMP_NEAR_NEARL2B,
+ THR_COMP_NEW_NEARESTL2B,
+ THR_COMP_NEAREST_NEWL2B,
+ THR_COMP_NEW_NEARL2B,
+ THR_COMP_NEAR_NEWL2B,
+ THR_COMP_NEW_NEWL2B,
+ THR_COMP_ZERO_ZEROL2B,
+
+ THR_COMP_NEAR_NEARESTL3B,
+ THR_COMP_NEAREST_NEARL3B,
+ THR_COMP_NEAR_NEARL3B,
+ THR_COMP_NEW_NEARESTL3B,
+ THR_COMP_NEAREST_NEWL3B,
+ THR_COMP_NEW_NEARL3B,
+ THR_COMP_NEAR_NEWL3B,
+ THR_COMP_NEW_NEWL3B,
+ THR_COMP_ZERO_ZEROL3B,
+
+ THR_COMP_NEAR_NEARESTGB,
+ THR_COMP_NEAREST_NEARGB,
+ THR_COMP_NEAR_NEARGB,
+ THR_COMP_NEW_NEARESTGB,
+ THR_COMP_NEAREST_NEWGB,
+ THR_COMP_NEW_NEARGB,
+ THR_COMP_NEAR_NEWGB,
+ THR_COMP_NEW_NEWGB,
+ THR_COMP_ZERO_ZEROGB,
+#endif // CONFIG_EXT_REFS
+
+#else // CONFIG_EXT_INTER
+
THR_COMP_NEARLA,
THR_COMP_NEWLA,
#if CONFIG_EXT_REFS
@@ -131,6 +273,7 @@
THR_COMP_ZEROL3A,
#endif // CONFIG_EXT_REFS
THR_COMP_ZEROGA,
+
#if CONFIG_EXT_REFS
THR_COMP_ZEROLB,
THR_COMP_ZEROL2B,
@@ -138,6 +281,8 @@
THR_COMP_ZEROGB,
#endif // CONFIG_EXT_REFS
+#endif // CONFIG_EXT_INTER
+
THR_H_PRED,
THR_V_PRED,
THR_D135_PRED,
@@ -146,6 +291,42 @@
THR_D63_PRED,
THR_D117_PRED,
THR_D45_PRED,
+
+#if CONFIG_EXT_INTER
+ THR_COMP_INTERINTRA_ZEROL,
+ THR_COMP_INTERINTRA_NEARESTL,
+ THR_COMP_INTERINTRA_NEARL,
+ THR_COMP_INTERINTRA_NEWL,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_INTERINTRA_ZEROL2,
+ THR_COMP_INTERINTRA_NEARESTL2,
+ THR_COMP_INTERINTRA_NEARL2,
+ THR_COMP_INTERINTRA_NEWL2,
+
+ THR_COMP_INTERINTRA_ZEROL3,
+ THR_COMP_INTERINTRA_NEARESTL3,
+ THR_COMP_INTERINTRA_NEARL3,
+ THR_COMP_INTERINTRA_NEWL3,
+#endif // CONFIG_EXT_REFS
+
+ THR_COMP_INTERINTRA_ZEROG,
+ THR_COMP_INTERINTRA_NEARESTG,
+ THR_COMP_INTERINTRA_NEARG,
+ THR_COMP_INTERINTRA_NEWG,
+
+#if CONFIG_EXT_REFS
+ THR_COMP_INTERINTRA_ZEROB,
+ THR_COMP_INTERINTRA_NEARESTB,
+ THR_COMP_INTERINTRA_NEARB,
+ THR_COMP_INTERINTRA_NEWB,
+#endif // CONFIG_EXT_REFS
+
+ THR_COMP_INTERINTRA_ZEROA,
+ THR_COMP_INTERINTRA_NEARESTA,
+ THR_COMP_INTERINTRA_NEARA,
+ THR_COMP_INTERINTRA_NEWA,
+#endif // CONFIG_EXT_INTER
} THR_MODES;
typedef enum {
@@ -153,24 +334,25 @@
#if CONFIG_EXT_REFS
THR_LAST2,
THR_LAST3,
-#endif // CONFIG_EXT_REFS
- THR_GOLD,
-#if CONFIG_EXT_REFS
THR_BWDR,
#endif // CONFIG_EXT_REFS
+ THR_GOLD,
THR_ALTR,
+
THR_COMP_LA,
#if CONFIG_EXT_REFS
THR_COMP_L2A,
THR_COMP_L3A,
#endif // CONFIG_EXT_REFS
THR_COMP_GA,
+
#if CONFIG_EXT_REFS
THR_COMP_LB,
THR_COMP_L2B,
THR_COMP_L3B,
THR_COMP_GB,
#endif // CONFIG_EXT_REFS
+
THR_INTRA,
} THR_MODES_SUB8X8;
@@ -184,7 +366,7 @@
int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
- int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
+ int64_t prediction_type_threshes[TOTAL_REFS_PER_FRAME][REFERENCE_MODES];
int RDMULT;
int RDDIV;
@@ -213,7 +395,7 @@
void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x,
int qindex);
-void av1_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
+void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n,
unsigned int qstep, int *rate, int64_t *dist);
int av1_get_switchable_rate(const struct AV1_COMP *cpi, const MACROBLOCKD *xd);
@@ -236,16 +418,20 @@
void av1_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
const struct macroblockd_plane *pd,
- ENTROPY_CONTEXT t_above[16],
- ENTROPY_CONTEXT t_left[16]);
+ ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]);
void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi);
void av1_set_rd_speed_thresholds_sub8x8(struct AV1_COMP *cpi);
-void av1_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh, int bsize,
+void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
+ int (*fact)[MAX_MODES], int rd_thresh, int bsize,
int best_mode_index);
+void av1_fill_token_costs(av1_coeff_cost *c,
+ av1_coeff_probs_model (*p)[PLANE_TYPES]);
+
static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
int thresh_fact) {
return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 92ef15a..5c3b8fb 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -22,6 +22,7 @@
#include "aom_ports/system_state.h"
#include "av1/common/common.h"
+#include "av1/common/common_data.h"
#include "av1/common/entropy.h"
#include "av1/common/entropymode.h"
#include "av1/common/idct.h"
@@ -47,31 +48,46 @@
#include "av1/encoder/ratectrl.h"
#include "av1/encoder/rd.h"
#include "av1/encoder/rdopt.h"
-
+#include "av1/encoder/tokenize.h"
#if CONFIG_PVQ
#include "av1/encoder/pvq_encoder.h"
#endif
+#if CONFIG_DUAL_FILTER
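+// All candidate pairs of interpolation filters evaluated by the dual-filter
+// search (one entry per filter combination).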
+#if CONFIG_EXT_INTERP
+static const int filter_sets[25][2] = {
+ { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 }, { 0, 4 }, { 1, 0 }, { 1, 1 },
+ { 1, 2 }, { 1, 3 }, { 1, 4 }, { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 },
+ { 2, 4 }, { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 }, { 3, 4 }, { 4, 0 },
+ { 4, 1 }, { 4, 2 }, { 4, 3 }, { 4, 4 },
+};
+#else
+static const int filter_sets[9][2] = {
+ { 0, 0 }, { 0, 1 }, { 0, 2 }, { 1, 0 }, { 1, 1 },
+ { 1, 2 }, { 2, 0 }, { 2, 1 }, { 2, 2 },
+};
+#endif
+#endif
#if CONFIG_EXT_REFS
#define LAST_FRAME_MODE_MASK \
((1 << INTRA_FRAME) | (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | \
- (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME)) // NOLINT
+ (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
#define LAST2_FRAME_MODE_MASK \
((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST3_FRAME) | \
- (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME)) // NOLINT
+ (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
#define LAST3_FRAME_MODE_MASK \
((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
- (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME)) // NOLINT
+ (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
#define GOLDEN_FRAME_MODE_MASK \
((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
- (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME)) // NOLINT
+ (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
#define BWDREF_FRAME_MODE_MASK \
((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
- (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME)) // NOLINT
+ (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME))
#define ALTREF_FRAME_MODE_MASK \
((1 << INTRA_FRAME) | (1 << LAST_FRAME) | (1 << LAST2_FRAME) | \
- (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME)) // NOLINT
+ (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME))
#else
@@ -93,7 +109,14 @@
#define MIN_EARLY_TERM_INDEX 3
#define NEW_MV_DISCOUNT_FACTOR 8
-const double ext_tx_th = 0.99;
+#if CONFIG_EXT_INTRA
+#define ANGLE_FAST_SEARCH 1
+#define ANGLE_SKIP_THRESH 10
+#define FILTER_FAST_SEARCH 1
+#endif // CONFIG_EXT_INTRA
+
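+// Linear-classifier weights and bias used by adst_vs_flipadst()
+// (vertical set first, then horizontal).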
+const double ADST_FLIP_SVM[8] = { -6.6623, -2.8062, -3.2531, 3.1671, // vert
+ -7.7051, -3.2234, -3.6193, 3.4533 }; // horz
typedef struct {
PREDICTION_MODE mode;
@@ -103,10 +126,10 @@
typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION;
struct rdcost_block_args {
- const AV1_COMMON *cm;
+ const AV1_COMP *cpi;
MACROBLOCK *x;
- ENTROPY_CONTEXT t_above[16];
- ENTROPY_CONTEXT t_left[16];
+ ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE];
+ ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE];
int this_rate;
int64_t this_dist;
int64_t this_sse;
@@ -149,6 +172,17 @@
{ NEARMV, { ALTREF_FRAME, NONE } },
{ NEARMV, { GOLDEN_FRAME, NONE } },
+#if CONFIG_EXT_INTER
+ { NEWFROMNEARMV, { LAST_FRAME, NONE } },
+#if CONFIG_EXT_REFS
+ { NEWFROMNEARMV, { LAST2_FRAME, NONE } },
+ { NEWFROMNEARMV, { LAST3_FRAME, NONE } },
+ { NEWFROMNEARMV, { BWDREF_FRAME, NONE } },
+#endif // CONFIG_EXT_REFS
+ { NEWFROMNEARMV, { ALTREF_FRAME, NONE } },
+ { NEWFROMNEARMV, { GOLDEN_FRAME, NONE } },
+#endif // CONFIG_EXT_INTER
+
{ ZEROMV, { LAST_FRAME, NONE } },
#if CONFIG_EXT_REFS
{ ZEROMV, { LAST2_FRAME, NONE } },
@@ -158,7 +192,23 @@
{ ZEROMV, { GOLDEN_FRAME, NONE } },
{ ZEROMV, { ALTREF_FRAME, NONE } },
- // TODO(zoeliu): May need to reconsider the order on the modes to check
+// TODO(zoeliu): May need to reconsider the order on the modes to check
+
+#if CONFIG_EXT_INTER
+ { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+#if CONFIG_EXT_REFS
+ { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+#else // CONFIG_EXT_INTER
{ NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
#if CONFIG_EXT_REFS
@@ -172,9 +222,97 @@
{ NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
{ NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
#endif // CONFIG_EXT_REFS
+#endif // CONFIG_EXT_INTER
{ TM_PRED, { INTRA_FRAME, NONE } },
+#if CONFIG_EXT_INTER
+ { NEAR_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { ZERO_ZEROMV, { LAST_FRAME, ALTREF_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { NEAR_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { ZERO_ZEROMV, { LAST2_FRAME, ALTREF_FRAME } },
+
+ { NEAR_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { ZERO_ZEROMV, { LAST3_FRAME, ALTREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+ { NEAR_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { ZERO_ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { NEAR_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { ZERO_ZEROMV, { LAST_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { ZERO_ZEROMV, { LAST2_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { ZERO_ZEROMV, { LAST3_FRAME, BWDREF_FRAME } },
+
+ { NEAR_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { ZERO_ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+#else // CONFIG_EXT_INTER
+
{ NEARMV, { LAST_FRAME, ALTREF_FRAME } },
{ NEWMV, { LAST_FRAME, ALTREF_FRAME } },
#if CONFIG_EXT_REFS
@@ -203,6 +341,7 @@
{ ZEROMV, { LAST3_FRAME, ALTREF_FRAME } },
#endif // CONFIG_EXT_REFS
{ ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
#if CONFIG_EXT_REFS
{ ZEROMV, { LAST_FRAME, BWDREF_FRAME } },
{ ZEROMV, { LAST2_FRAME, BWDREF_FRAME } },
@@ -210,6 +349,8 @@
{ ZEROMV, { GOLDEN_FRAME, BWDREF_FRAME } },
#endif // CONFIG_EXT_REFS
+#endif // CONFIG_EXT_INTER
+
{ H_PRED, { INTRA_FRAME, NONE } },
{ V_PRED, { INTRA_FRAME, NONE } },
{ D135_PRED, { INTRA_FRAME, NONE } },
@@ -218,31 +359,424 @@
{ D63_PRED, { INTRA_FRAME, NONE } },
{ D117_PRED, { INTRA_FRAME, NONE } },
{ D45_PRED, { INTRA_FRAME, NONE } },
+
+#if CONFIG_EXT_INTER
+ { ZEROMV, { LAST_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { LAST_FRAME, INTRA_FRAME } },
+ { NEARMV, { LAST_FRAME, INTRA_FRAME } },
+ { NEWMV, { LAST_FRAME, INTRA_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { ZEROMV, { LAST2_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { LAST2_FRAME, INTRA_FRAME } },
+ { NEARMV, { LAST2_FRAME, INTRA_FRAME } },
+ { NEWMV, { LAST2_FRAME, INTRA_FRAME } },
+
+ { ZEROMV, { LAST3_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { LAST3_FRAME, INTRA_FRAME } },
+ { NEARMV, { LAST3_FRAME, INTRA_FRAME } },
+ { NEWMV, { LAST3_FRAME, INTRA_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+ { ZEROMV, { GOLDEN_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { GOLDEN_FRAME, INTRA_FRAME } },
+ { NEARMV, { GOLDEN_FRAME, INTRA_FRAME } },
+ { NEWMV, { GOLDEN_FRAME, INTRA_FRAME } },
+
+#if CONFIG_EXT_REFS
+ { ZEROMV, { BWDREF_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { BWDREF_FRAME, INTRA_FRAME } },
+ { NEARMV, { BWDREF_FRAME, INTRA_FRAME } },
+ { NEWMV, { BWDREF_FRAME, INTRA_FRAME } },
+#endif // CONFIG_EXT_REFS
+
+ { ZEROMV, { ALTREF_FRAME, INTRA_FRAME } },
+ { NEARESTMV, { ALTREF_FRAME, INTRA_FRAME } },
+ { NEARMV, { ALTREF_FRAME, INTRA_FRAME } },
+ { NEWMV, { ALTREF_FRAME, INTRA_FRAME } },
+#endif // CONFIG_EXT_INTER
};
static const REF_DEFINITION av1_ref_order[MAX_REFS] = {
{ { LAST_FRAME, NONE } },
#if CONFIG_EXT_REFS
{ { LAST2_FRAME, NONE } }, { { LAST3_FRAME, NONE } },
-#endif // CONFIG_EXT_REFS
- { { GOLDEN_FRAME, NONE } },
-#if CONFIG_EXT_REFS
{ { BWDREF_FRAME, NONE } },
#endif // CONFIG_EXT_REFS
- { { ALTREF_FRAME, NONE } }, { { LAST_FRAME, ALTREF_FRAME } },
+ { { GOLDEN_FRAME, NONE } }, { { ALTREF_FRAME, NONE } },
+
+ { { LAST_FRAME, ALTREF_FRAME } },
#if CONFIG_EXT_REFS
{ { LAST2_FRAME, ALTREF_FRAME } }, { { LAST3_FRAME, ALTREF_FRAME } },
#endif // CONFIG_EXT_REFS
{ { GOLDEN_FRAME, ALTREF_FRAME } },
+
#if CONFIG_EXT_REFS
{ { LAST_FRAME, BWDREF_FRAME } }, { { LAST2_FRAME, BWDREF_FRAME } },
{ { LAST3_FRAME, BWDREF_FRAME } }, { { GOLDEN_FRAME, BWDREF_FRAME } },
#endif // CONFIG_EXT_REFS
+
{ { INTRA_FRAME, NONE } },
};
+#if CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
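+// Bit cost of signalling a value v with a near-uniform (truncated binary)
+// code over n symbols.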
+static INLINE int write_uniform_cost(int n, int v) {
+ int l = get_unsigned_bits(n), m = (1 << l) - n;
+ if (l == 0) return 0;
+ if (v < m)
+ return (l - 1) * av1_cost_bit(128, 0);
+ else
+ return l * av1_cost_bit(128, 0);
+}
+#endif // CONFIG_EXT_INTRA || CONFIG_FILTER_INTRA || CONFIG_PALETTE
+
+// Decision-boundary constants for the PRUNE_ONE and PRUNE_TWO transform-type
+// pruning modes.
+#define FAST_EXT_TX_CORR_MID 0.0
+#define FAST_EXT_TX_EDST_MID 0.1
+#define FAST_EXT_TX_CORR_MARGIN 0.5
+#define FAST_EXT_TX_EDST_MARGIN 0.3
+
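+// 1-D transform applied vertically (vtx_tab) and horizontally (htx_tab) for
+// each 2-D transform type.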
+static const TX_TYPE_1D vtx_tab[TX_TYPES] = {
+ DCT_1D, ADST_1D, DCT_1D, ADST_1D,
+#if CONFIG_EXT_TX
+ FLIPADST_1D, DCT_1D, FLIPADST_1D, ADST_1D, FLIPADST_1D, IDTX_1D,
+ DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, IDTX_1D,
+#endif // CONFIG_EXT_TX
+};
+
+static const TX_TYPE_1D htx_tab[TX_TYPES] = {
+ DCT_1D, DCT_1D, ADST_1D, ADST_1D,
+#if CONFIG_EXT_TX
+ DCT_1D, FLIPADST_1D, FLIPADST_1D, FLIPADST_1D, ADST_1D, IDTX_1D,
+ IDTX_1D, DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D,
+#endif // CONFIG_EXT_TX
+};
+
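+// Measures how the prediction error energy is spread over a 4x4 grid of
+// sub-blocks: hordist[] receives the normalized column sums and verdist[]
+// the normalized row sums (three entries each; the fourth is implied).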
+static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+ uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ double *hordist, double *verdist) {
+ int bw = 4 << (b_width_log2_lookup[bsize]);
+ int bh = 4 << (b_height_log2_lookup[bsize]);
+ unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ unsigned int var[16];
+ double total = 0;
+
+ const int f_index = bsize - BLOCK_16X16;
+ if (f_index < 0) {
+ int i, j, index;
+ int w_shift = bw == 8 ? 1 : 2;
+ int h_shift = bh == 8 ? 1 : 2;
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (cpi->common.use_highbitdepth) {
+ uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+ for (i = 0; i < bh; ++i)
+ for (j = 0; j < bw; ++j) {
+ index = (j >> w_shift) + ((i >> h_shift) << 2);
+ esq[index] +=
+ (src16[j + i * src_stride] - dst16[j + i * dst_stride]) *
+ (src16[j + i * src_stride] - dst16[j + i * dst_stride]);
+ }
+ } else {
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+ for (i = 0; i < bh; ++i)
+ for (j = 0; j < bw; ++j) {
+ index = (j >> w_shift) + ((i >> h_shift) << 2);
+ esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) *
+ (src[j + i * src_stride] - dst[j + i * dst_stride]);
+ }
+#if CONFIG_AOM_HIGHBITDEPTH
+ }
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ } else {
+ var[0] = cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[0]);
+ var[1] = cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4,
+ dst_stride, &esq[1]);
+ var[2] = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2,
+ dst_stride, &esq[2]);
+ var[3] = cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride,
+ dst + 3 * bw / 4, dst_stride, &esq[3]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ var[4] = cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[4]);
+ var[5] = cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4,
+ dst_stride, &esq[5]);
+ var[6] = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2,
+ dst_stride, &esq[6]);
+ var[7] = cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride,
+ dst + 3 * bw / 4, dst_stride, &esq[7]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ var[8] = cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[8]);
+ var[9] = cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4,
+ dst_stride, &esq[9]);
+ var[10] = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2,
+ dst_stride, &esq[10]);
+ var[11] = cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride,
+ dst + 3 * bw / 4, dst_stride, &esq[11]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ var[12] =
+ cpi->fn_ptr[f_index].vf(src, src_stride, dst, dst_stride, &esq[12]);
+ var[13] = cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride, dst + bw / 4,
+ dst_stride, &esq[13]);
+ var[14] = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, dst + bw / 2,
+ dst_stride, &esq[14]);
+ var[15] = cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride,
+ dst + 3 * bw / 4, dst_stride, &esq[15]);
+ }
+
+ total = esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] + esq[6] +
+ esq[7] + esq[8] + esq[9] + esq[10] + esq[11] + esq[12] + esq[13] +
+ esq[14] + esq[15];
+ if (total > 0) {
+ const double e_recip = 1.0 / total;
+ hordist[0] =
+ ((double)esq[0] + (double)esq[4] + (double)esq[8] + (double)esq[12]) *
+ e_recip;
+ hordist[1] =
+ ((double)esq[1] + (double)esq[5] + (double)esq[9] + (double)esq[13]) *
+ e_recip;
+ hordist[2] =
+ ((double)esq[2] + (double)esq[6] + (double)esq[10] + (double)esq[14]) *
+ e_recip;
+ verdist[0] =
+ ((double)esq[0] + (double)esq[1] + (double)esq[2] + (double)esq[3]) *
+ e_recip;
+ verdist[1] =
+ ((double)esq[4] + (double)esq[5] + (double)esq[6] + (double)esq[7]) *
+ e_recip;
+ verdist[2] =
+ ((double)esq[8] + (double)esq[9] + (double)esq[10] + (double)esq[11]) *
+ e_recip;
+ } else {
+ hordist[0] = verdist[0] = 0.25;
+ hordist[1] = verdist[1] = 0.25;
+ hordist[2] = verdist[2] = 0.25;
+ }
+ (void)var[0];
+ (void)var[1];
+ (void)var[2];
+ (void)var[3];
+ (void)var[4];
+ (void)var[5];
+ (void)var[6];
+ (void)var[7];
+ (void)var[8];
+ (void)var[9];
+ (void)var[10];
+ (void)var[11];
+ (void)var[12];
+ (void)var[13];
+ (void)var[14];
+ (void)var[15];
+}
+
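+// Projects the error-energy distribution onto the ADST_FLIP_SVM classifier
+// and returns a prune bitmask: bits 0-7 flag vertical 1-D transforms to skip,
+// bits 8-15 the horizontal ones.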
+static int adst_vs_flipadst(const AV1_COMP *cpi, BLOCK_SIZE bsize, uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride,
+ double *hdist, double *vdist) {
+ int prune_bitmask = 0;
+ double svm_proj_h = 0, svm_proj_v = 0;
+ get_energy_distribution_fine(cpi, bsize, src, src_stride, dst, dst_stride,
+ hdist, vdist);
+
+ svm_proj_v = vdist[0] * ADST_FLIP_SVM[0] + vdist[1] * ADST_FLIP_SVM[1] +
+ vdist[2] * ADST_FLIP_SVM[2] + ADST_FLIP_SVM[3];
+ svm_proj_h = hdist[0] * ADST_FLIP_SVM[4] + hdist[1] * ADST_FLIP_SVM[5] +
+ hdist[2] * ADST_FLIP_SVM[6] + ADST_FLIP_SVM[7];
+ if (svm_proj_v > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN)
+ prune_bitmask |= 1 << FLIPADST_1D;
+ else if (svm_proj_v < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN)
+ prune_bitmask |= 1 << ADST_1D;
+
+ if (svm_proj_h > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN)
+ prune_bitmask |= 1 << (FLIPADST_1D + 8);
+ else if (svm_proj_h < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN)
+ prune_bitmask |= 1 << (ADST_1D + 8);
+
+ return prune_bitmask;
+}
+
+#if CONFIG_EXT_TX
+static void get_horver_correlation(int16_t *diff, int stride, int w, int h,
+ double *hcorr, double *vcorr) {
+ // Computes the horizontal and vertical correlation coefficients of the
+ // residual, written to *hcorr and *vcorr.
+ const int num = (h - 1) * (w - 1);
+ double num_r;
+ int i, j;
+ int64_t xy_sum = 0, xz_sum = 0;
+ int64_t x_sum = 0, y_sum = 0, z_sum = 0;
+ int64_t x2_sum = 0, y2_sum = 0, z2_sum = 0;
+ double x_var_n, y_var_n, z_var_n, xy_var_n, xz_var_n;
+ *hcorr = *vcorr = 1;
+
+ assert(num > 0);
+ num_r = 1.0 / num;
+ for (i = 1; i < h; ++i) {
+ for (j = 1; j < w; ++j) {
+ const int16_t x = diff[i * stride + j];
+ const int16_t y = diff[i * stride + j - 1];
+ const int16_t z = diff[(i - 1) * stride + j];
+ xy_sum += x * y;
+ xz_sum += x * z;
+ x_sum += x;
+ y_sum += y;
+ z_sum += z;
+ x2_sum += x * x;
+ y2_sum += y * y;
+ z2_sum += z * z;
+ }
+ }
+ x_var_n = x2_sum - (x_sum * x_sum) * num_r;
+ y_var_n = y2_sum - (y_sum * y_sum) * num_r;
+ z_var_n = z2_sum - (z_sum * z_sum) * num_r;
+ xy_var_n = xy_sum - (x_sum * y_sum) * num_r;
+ xz_var_n = xz_sum - (x_sum * z_sum) * num_r;
+ if (x_var_n > 0 && y_var_n > 0) {
+ *hcorr = xy_var_n / sqrt(x_var_n * y_var_n);
+ *hcorr = *hcorr < 0 ? 0 : *hcorr;
+ }
+ if (x_var_n > 0 && z_var_n > 0) {
+ *vcorr = xz_var_n / sqrt(x_var_n * z_var_n);
+ *vcorr = *vcorr < 0 ? 0 : *vcorr;
+ }
+}
+
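+// Uses the horizontal/vertical correlation of the residual to flag DCT or
+// IDTX 1-D transforms for pruning; same bitmask layout as adst_vs_flipadst().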
+int dct_vs_idtx(int16_t *diff, int stride, int w, int h, double *hcorr,
+ double *vcorr) {
+ int prune_bitmask = 0;
+ get_horver_correlation(diff, stride, w, h, hcorr, vcorr);
+
+ if (*vcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
+ prune_bitmask |= 1 << IDTX_1D;
+ else if (*vcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
+ prune_bitmask |= 1 << DCT_1D;
+
+ if (*hcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
+ prune_bitmask |= 1 << (IDTX_1D + 8);
+ else if (*hcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
+ prune_bitmask |= 1 << (DCT_1D + 8);
+ return prune_bitmask;
+}
+
+// Performance drop: 0.5%, Speed improvement: 24%
+static int prune_two_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd, int adst_flipadst,
+ int dct_idtx) {
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+ const int bw = 4 << (b_width_log2_lookup[bs]);
+ const int bh = 4 << (b_height_log2_lookup[bs]);
+ double hdist[3] = { 0, 0, 0 }, vdist[3] = { 0, 0, 0 };
+ double hcorr, vcorr;
+ int prune = 0;
+ av1_subtract_plane(x, bsize, 0);
+
+ if (adst_flipadst)
+ prune |= adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride, hdist, vdist);
+ if (dct_idtx) prune |= dct_vs_idtx(p->src_diff, bw, bw, bh, &hcorr, &vcorr);
+
+ return prune;
+}
+#endif // CONFIG_EXT_TX
+
+// Performance drop: 0.3%, Speed improvement: 5%
+static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd) {
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ double hdist[3] = { 0, 0, 0 }, vdist[3] = { 0, 0, 0 };
+ av1_subtract_plane(x, bsize, 0);
+ return adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, hdist, vdist);
+}
+
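+// Computes the prune bitmask for the configured pruning mode; the result is
+// later consulted by do_tx_type_search().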
+static int prune_tx_types(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
+ MACROBLOCKD *xd, int tx_set) {
+#if CONFIG_EXT_TX
+ const int *tx_set_1D = ext_tx_used_inter_1D[tx_set];
+#else
+ const int tx_set_1D[TX_TYPES_1D] = { 0 };
+#endif
+
+ switch (cpi->sf.tx_type_search.prune_mode) {
+ case NO_PRUNE: return 0; break;
+ case PRUNE_ONE:
+ if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D]))
+ return 0;
+ return prune_one_for_sby(cpi, bsize, x, xd);
+ break;
+#if CONFIG_EXT_TX
+ case PRUNE_TWO:
+ if ((tx_set >= 0) && !(tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D])) {
+ if (!(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D])) return 0;
+ return prune_two_for_sby(cpi, bsize, x, xd, 0, 1);
+ }
+ if ((tx_set >= 0) && !(tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D]))
+ return prune_two_for_sby(cpi, bsize, x, xd, 1, 0);
+ return prune_two_for_sby(cpi, bsize, x, xd, 1, 1);
+ break;
+#endif
+ }
+ assert(0);
+ return 0;
+}
+
+static int do_tx_type_search(TX_TYPE tx_type, int prune) {
+// TODO(sarahparker): implement for non-EXT_TX configurations.
+#if CONFIG_EXT_TX
+ return !(((prune >> vtx_tab[tx_type]) & 1) |
+ ((prune >> (htx_tab[tx_type] + 8)) & 1));
+#else
+ // temporary to avoid compiler warnings
+ (void)vtx_tab;
+ (void)htx_tab;
+ (void)tx_type;
+ (void)prune;
+ return 1;
+#endif
+}
+
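+// Estimates rate and distortion for one plane directly from its SSE, using
+// either the simple quantizer-based model or av1_model_rd_from_var_lapndz().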
+static void model_rd_from_sse(const AV1_COMP *const cpi,
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize,
+ int plane, int64_t sse, int *rate,
+ int64_t *dist) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dequant_shift =
+#if CONFIG_AOM_HIGHBITDEPTH
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 :
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ 3;
+
+ // Fast approximation of the modelling function.
+ if (cpi->sf.simple_model_rd_from_var) {
+ const int64_t square_error = sse;
+ int quantizer = (pd->dequant[1] >> dequant_shift);
+
+ if (quantizer < 120)
+ *rate = (int)((square_error * (280 - quantizer)) >>
+ (16 - AV1_PROB_COST_SHIFT));
+ else
+ *rate = 0;
+ *dist = (square_error * quantizer) >> 8;
+ } else {
+ av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bsize],
+ pd->dequant[1] >> dequant_shift, rate, dist);
+ }
+
+ *dist <<= 4;
+}
+
static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
- MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum,
+ MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
+ int plane_to, int *out_rate_sum,
int64_t *out_dist_sum, int *skip_txfm_sb,
int64_t *skip_sse_sb) {
// Note our transform coeffs are 8 times an orthogonal transform.
@@ -254,15 +788,10 @@
int64_t rate_sum = 0;
int64_t dist_sum = 0;
int64_t total_sse = 0;
- const int dequant_shift =
-#if CONFIG_AOM_HIGHBITDEPTH
- (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 :
-#endif // CONFIG_AOM_HIGHBITDEPTH
- 3;
x->pred_sse[ref] = 0;
- for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ for (plane = plane_from; plane <= plane_to; ++plane) {
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
@@ -280,22 +809,8 @@
total_sse += sse;
- // Fast approximate the modelling function.
- if (cpi->sf.simple_model_rd_from_var) {
- const int64_t square_error = sse;
- const int quantizer = (pd->dequant[1] >> dequant_shift);
- const int64_t rate_temp =
- (quantizer < 120)
- ? (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT)
- : 0;
- assert(rate_temp == (int)rate_temp);
- rate = (int)rate_temp;
- dist = (square_error * quantizer) >> 8;
- } else {
- av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bs],
- pd->dequant[1] >> dequant_shift, &rate,
- &dist);
- }
+ model_rd_from_sse(cpi, xd, bs, plane, sse, &rate, &dist);
+
rate_sum += rate;
dist_sum += dist;
}
@@ -303,7 +818,7 @@
*skip_txfm_sb = total_sse == 0;
*skip_sse_sb = total_sse << 4;
*out_rate_sum = (int)rate_sum;
- *out_dist_sum = dist_sum << 4;
+ *out_dist_sum = dist_sum;
}
#if CONFIG_PVQ
@@ -385,172 +900,333 @@
#endif // CONFIG_AOM_HIGHBITDEPTH
#if !CONFIG_PVQ
-/* The trailing '0' is a terminator which is used inside cost_coeffs() to
+/* The trailing '0' is a terminator which is used inside av1_cost_coeffs() to
* decide whether to include cost of a trailing EOB node or not (i.e. we
* can skip this if the last coefficient in this transform block, e.g. the
* 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block,
* were non-zero). */
-static const int16_t band_counts[TX_SIZES][8] = {
-#if CONFIG_CB4X4
- {
- 1, 2, 2, 3, 0, 0, 0,
- },
-#endif
- { 1, 2, 3, 4, 3, 16 - 13, 0 },
- { 1, 2, 3, 4, 11, 64 - 21, 0 },
- { 1, 2, 3, 4, 11, 256 - 21, 0 },
- { 1, 2, 3, 4, 11, 1024 - 21, 0 },
-};
-
-static int cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
- int block, ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
- TX_SIZE tx_size, const int16_t *scan, const int16_t *nb,
- int use_fast_coef_costing) {
+int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
+ int block, int coeff_ctx, TX_SIZE tx_size,
+ const int16_t *scan, const int16_t *nb,
+ int use_fast_coef_costing) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
const struct macroblock_plane *p = &x->plane[plane];
const struct macroblockd_plane *pd = &xd->plane[plane];
const PLANE_TYPE type = pd->plane_type;
- const int16_t *band_count = &band_counts[tx_size][1];
+ const uint16_t *band_count = &band_count_table[tx_size][1];
const int eob = p->eobs[block];
const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ const int tx_size_ctx = txsize_sqr_map[tx_size];
unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
- x->token_costs[tx_size][type][is_inter_block(mbmi)];
- uint8_t token_cache[32 * 32];
- int pt = combine_entropy_contexts(*A, *L);
+ x->token_costs[tx_size_ctx][type][is_inter_block(mbmi)];
+ uint8_t token_cache[MAX_TX_SQUARE];
+ int pt = coeff_ctx;
int c, cost;
#if CONFIG_AOM_HIGHBITDEPTH
const int *cat6_high_cost = av1_get_high_cost_table(xd->bd);
#else
const int *cat6_high_cost = av1_get_high_cost_table(8);
#endif
- (void)cm;
+#if !CONFIG_VAR_TX && !CONFIG_SUPERTX
// Check for consistency of tx_size with mode info
assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
: get_uv_tx_size(mbmi, pd) == tx_size);
+#endif // !CONFIG_VAR_TX && !CONFIG_SUPERTX
+ (void)cm;
if (eob == 0) {
// single eob token
cost = token_costs[0][0][pt][EOB_TOKEN];
c = 0;
} else {
- int band_left = *band_count++;
+ if (use_fast_coef_costing) {
+ int band_left = *band_count++;
- // dc token
- int v = qcoeff[0];
- int16_t prev_t;
- EXTRABIT e;
- av1_get_token_extra(v, &prev_t, &e);
- cost =
- (*token_costs)[0][pt][prev_t] + av1_get_cost(prev_t, e, cat6_high_cost);
+ // dc token
+ int v = qcoeff[0];
+ int16_t prev_t;
+ cost = av1_get_token_cost(v, &prev_t, cat6_high_cost);
+ cost += (*token_costs)[0][pt][prev_t];
- token_cache[0] = av1_pt_energy_class[prev_t];
- ++token_costs;
+ token_cache[0] = av1_pt_energy_class[prev_t];
+ ++token_costs;
- // ac tokens
- for (c = 1; c < eob; c++) {
- const int rc = scan[c];
- int16_t t;
+ // ac tokens
+ for (c = 1; c < eob; c++) {
+ const int rc = scan[c];
+ int16_t t;
- v = qcoeff[rc];
- av1_get_token_extra(v, &t, &e);
- if (use_fast_coef_costing) {
- cost += (*token_costs)[!prev_t][!prev_t][t] +
- av1_get_cost(t, e, cat6_high_cost);
- } else {
+ v = qcoeff[rc];
+ cost += av1_get_token_cost(v, &t, cat6_high_cost);
+ cost += (*token_costs)[!prev_t][!prev_t][t];
+ prev_t = t;
+ if (!--band_left) {
+ band_left = *band_count++;
+ ++token_costs;
+ }
+ }
+
+ // eob token
+ if (band_left) cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
+
+ } else { // !use_fast_coef_costing
+ int band_left = *band_count++;
+
+ // dc token
+ int v = qcoeff[0];
+ int16_t tok;
+ unsigned int(*tok_cost_ptr)[COEFF_CONTEXTS][ENTROPY_TOKENS];
+ cost = av1_get_token_cost(v, &tok, cat6_high_cost);
+ cost += (*token_costs)[0][pt][tok];
+
+ token_cache[0] = av1_pt_energy_class[tok];
+ ++token_costs;
+
+ tok_cost_ptr = &((*token_costs)[!tok]);
+
+ // ac tokens
+ for (c = 1; c < eob; c++) {
+ const int rc = scan[c];
+
+ v = qcoeff[rc];
+ cost += av1_get_token_cost(v, &tok, cat6_high_cost);
pt = get_coef_context(nb, token_cache, c);
- cost +=
- (*token_costs)[!prev_t][pt][t] + av1_get_cost(t, e, cat6_high_cost);
- token_cache[rc] = av1_pt_energy_class[t];
+ cost += (*tok_cost_ptr)[pt][tok];
+ token_cache[rc] = av1_pt_energy_class[tok];
+ if (!--band_left) {
+ band_left = *band_count++;
+ ++token_costs;
+ }
+ tok_cost_ptr = &((*token_costs)[!tok]);
}
- prev_t = t;
- if (!--band_left) {
- band_left = *band_count++;
- ++token_costs;
- }
- }
- // eob token
- if (band_left) {
- if (use_fast_coef_costing) {
- cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
- } else {
+ // eob token
+ if (band_left) {
pt = get_coef_context(nb, token_cache, c);
cost += (*token_costs)[0][pt][EOB_TOKEN];
}
}
}
- // is eob first coefficient;
- *A = *L = (c > 0);
-
return cost;
}
#endif
-static void dist_block(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
+static void dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block,
+ int blk_row, int blk_col, TX_SIZE tx_size,
int64_t *out_dist, int64_t *out_sse) {
- const int ss_txfrm_size = 1 << (tx_size_1d_log2[tx_size] << 1);
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
- int64_t this_sse;
- int shift = tx_size == TX_32X32 ? 0 : 2;
- tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
- tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ if (cpi->sf.use_transform_domain_distortion) {
+ // Transform domain distortion computation is more efficient as it does
+ // not involve an inverse transform, but it is less accurate.
+ const int buffer_length = tx_size_2d[tx_size];
+ int64_t this_sse;
+ int tx_type = get_tx_type(pd->plane_type, xd, block, tx_size);
+ int shift = (MAX_TX_SCALE - get_tx_scale(xd, tx_type, tx_size)) * 2;
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
#if CONFIG_PVQ
- tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
+ tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
#endif
#if CONFIG_AOM_HIGHBITDEPTH
- const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
- *out_dist =
- av1_highbd_block_error(coeff, dqcoeff, ss_txfrm_size, &this_sse, bd) >>
- shift;
+ const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
+ *out_dist =
+ av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, bd) >>
+ shift;
#elif CONFIG_PVQ
- *out_dist =
- av1_block_error2_c(coeff, dqcoeff, ref_coeff, ss_txfrm_size, &this_sse) >>
- shift;
+ *out_dist = av1_block_error2_c(coeff, dqcoeff, ref_coeff, buffer_length,
+ &this_sse) >>
+ shift;
#else
- *out_dist =
- av1_block_error(coeff, dqcoeff, ss_txfrm_size, &this_sse) >> shift;
+ *out_dist =
+ av1_block_error(coeff, dqcoeff, buffer_length, &this_sse) >> shift;
#endif // CONFIG_AOM_HIGHBITDEPTH
- *out_sse = this_sse >> shift;
+ *out_sse = this_sse >> shift;
+ } else {
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ const int bsw = block_size_wide[tx_bsize];
+ const int bsh = block_size_high[tx_bsize];
+ const int src_stride = x->plane[plane].src.stride;
+ const int dst_stride = xd->plane[plane].dst.stride;
+ // Scale the transform block index to pixel unit.
+ const int src_idx = (blk_row * src_stride + blk_col)
+ << tx_size_wide_log2[0];
+ const int dst_idx = (blk_row * dst_stride + blk_col)
+ << tx_size_wide_log2[0];
+ const uint8_t *src = &x->plane[plane].src.buf[src_idx];
+ const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
+ const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const uint16_t eob = p->eobs[block];
+
+ unsigned int tmp;
+
+ assert(cpi != NULL);
+ assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
+
+ cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
+ *out_sse = (int64_t)tmp * 16;
+
+ if (eob) {
+ const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_AOM_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
+ uint8_t *recon = (uint8_t *)recon16;
+#else
+ DECLARE_ALIGNED(16, uint8_t, recon[MAX_TX_SQUARE]);
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+ const PLANE_TYPE plane_type = plane == 0 ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+
+ INV_TXFM_PARAM inv_txfm_param;
+
+ inv_txfm_param.tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ inv_txfm_param.tx_size = tx_size;
+ inv_txfm_param.eob = eob;
+ inv_txfm_param.lossless = xd->lossless[mbmi->segment_id];
+
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ recon = CONVERT_TO_BYTEPTR(recon);
+ inv_txfm_param.bd = xd->bd;
+ aom_highbd_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0,
+ NULL, 0, bsw, bsh, xd->bd);
+ highbd_inv_txfm_add(dqcoeff, recon, MAX_TX_SIZE, &inv_txfm_param);
+ } else
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ {
+ aom_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE, NULL, 0, NULL, 0,
+ bsw, bsh);
+ inv_txfm_add(dqcoeff, recon, MAX_TX_SIZE, &inv_txfm_param);
+ }
+
+ cpi->fn_ptr[tx_bsize].vf(src, src_stride, recon, MAX_TX_SIZE, &tmp);
+ }
+
+ *out_dist = (int64_t)tmp * 16;
+ }
}
#if !CONFIG_PVQ
-static int rate_block(int plane, int block, int blk_row, int blk_col,
- TX_SIZE tx_size, struct rdcost_block_args *args) {
- return cost_coeffs(args->cm, args->x, plane, block, args->t_above + blk_col,
- args->t_left + blk_row, tx_size, args->scan_order->scan,
- args->scan_order->neighbors, args->use_fast_coef_costing);
+static int rate_block(int plane, int block, int coeff_ctx, TX_SIZE tx_size,
+ struct rdcost_block_args *args) {
+ return av1_cost_coeffs(&args->cpi->common, args->x, plane, block, coeff_ctx,
+ tx_size, args->scan_order->scan,
+ args->scan_order->neighbors,
+ args->use_fast_coef_costing);
}
#endif
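+// Sum of squared residuals over one transform block; rectangular sizes are
+// handled as two square halves.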
+static uint64_t sum_squares_2d(const int16_t *diff, int diff_stride,
+ TX_SIZE tx_size) {
+ uint64_t sse;
+ switch (tx_size) {
+#if CONFIG_EXT_TX
+ case TX_4X8:
+ sse = aom_sum_squares_2d_i16(diff, diff_stride, 4) +
+ aom_sum_squares_2d_i16(diff + 4 * diff_stride, diff_stride, 4);
+ break;
+ case TX_8X4:
+ sse = aom_sum_squares_2d_i16(diff, diff_stride, 4) +
+ aom_sum_squares_2d_i16(diff + 4, diff_stride, 4);
+ break;
+ case TX_8X16:
+ sse = aom_sum_squares_2d_i16(diff, diff_stride, 8) +
+ aom_sum_squares_2d_i16(diff + 8 * diff_stride, diff_stride, 8);
+ break;
+ case TX_16X8:
+ sse = aom_sum_squares_2d_i16(diff, diff_stride, 8) +
+ aom_sum_squares_2d_i16(diff + 8, diff_stride, 8);
+ break;
+ case TX_16X32:
+ sse = aom_sum_squares_2d_i16(diff, diff_stride, 16) +
+ aom_sum_squares_2d_i16(diff + 16 * diff_stride, diff_stride, 16);
+ break;
+ case TX_32X16:
+ sse = aom_sum_squares_2d_i16(diff, diff_stride, 16) +
+ aom_sum_squares_2d_i16(diff + 16, diff_stride, 16);
+ break;
+#endif // CONFIG_EXT_TX
+ default:
+ assert(tx_size < TX_SIZES);
+ sse = aom_sum_squares_2d_i16(diff, diff_stride, tx_size_wide[tx_size]);
+ break;
+ }
+ return sse;
+}
+
static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
struct rdcost_block_args *args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
- const AV1_COMMON *const cm = args->cm;
+ const AV1_COMMON *cm = &args->cpi->common;
int64_t rd1, rd2, rd;
int rate;
int64_t dist;
int64_t sse;
+ int coeff_ctx = combine_entropy_contexts(*(args->t_above + blk_col),
+ *(args->t_left + blk_row));
+
if (args->exit_early) return;
if (!is_inter_block(mbmi)) {
- struct encode_b_args b_args = { (AV1_COMMON *)cm, x, NULL, &mbmi->skip };
+ struct encode_b_args b_args = {
+ (AV1_COMMON *)cm, x, NULL, &mbmi->skip, args->t_above, args->t_left, 1
+ };
av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
&b_args);
- dist_block(x, plane, block, tx_size, &dist, &sse);
+
+ if (args->cpi->sf.use_transform_domain_distortion) {
+ dist_block(args->cpi, x, plane, block, blk_row, blk_col, tx_size, &dist,
+ &sse);
+ } else {
+ // Note that av1_encode_block_intra() above already calls inv_txfm_add,
+ // so we cannot simply call dist_block() here.
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ const aom_variance_fn_t variance = args->cpi->fn_ptr[tx_bsize].vf;
+
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ const int src_stride = p->src.stride;
+ const int dst_stride = pd->dst.stride;
+ const int diff_stride = block_size_wide[plane_bsize];
+
+ const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
+ const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
+ const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+ unsigned int tmp;
+ sse = sum_squares_2d(diff, diff_stride, tx_size);
+
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ sse = (int64_t)sse * 16;
+
+ variance(src, src_stride, dst, dst_stride, &tmp);
+ dist = (int64_t)tmp * 16;
+ }
} else {
- // full forward transform and quantization
- av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
- tx_size);
- dist_block(x, plane, block, tx_size, &dist, &sse);
+// full forward transform and quantization
+#if CONFIG_NEW_QUANT
+ av1_xform_quant_fp_nuq(cm, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, coeff_ctx);
+#else
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ AV1_XFORM_QUANT_FP);
+#endif // CONFIG_NEW_QUANT
+ if (x->plane[plane].eobs[block])
+ av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx);
+ dist_block(args->cpi, x, plane, block, blk_row, blk_col, tx_size, &dist,
+ &sse);
}
rd = RDCOST(x->rdmult, x->rddiv, 0, dist);
@@ -559,7 +1235,9 @@
return;
}
#if !CONFIG_PVQ
- rate = rate_block(plane, block, blk_row, blk_col, tx_size, args);
+ rate = rate_block(plane, block, coeff_ctx, tx_size, args);
+ args->t_above[blk_col] = (x->plane[plane].eobs[block] > 0);
+ args->t_left[blk_row] = (x->plane[plane].eobs[block] > 0);
#else
rate = x->rate;
#endif
@@ -585,28 +1263,29 @@
#endif
}
-static void txfm_rd_in_plane(const AV1_COMMON *const cm, MACROBLOCK *x,
- int *rate, int64_t *distortion, int *skippable,
- int64_t *sse, int64_t ref_best_rd, int plane,
- BLOCK_SIZE bsize, TX_SIZE tx_size,
- int use_fast_coef_casting) {
+static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, int *rate,
+ int64_t *distortion, int *skippable, int64_t *sse,
+ int64_t ref_best_rd, int plane, BLOCK_SIZE bsize,
+ TX_SIZE tx_size, int use_fast_coef_casting) {
+ const AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblockd_plane *const pd = &xd->plane[plane];
TX_TYPE tx_type;
struct rdcost_block_args args;
av1_zero(args);
args.x = x;
+ args.cpi = cpi;
args.best_rd = ref_best_rd;
args.use_fast_coef_costing = use_fast_coef_casting;
args.skippable = 1;
- args.cm = cm;
if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size;
av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
- tx_type = get_tx_type(pd->plane_type, xd, 0);
- args.scan_order = get_scan(cm, tx_size, tx_type);
+ tx_type = get_tx_type(pd->plane_type, xd, 0, tx_size);
+ args.scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
&args);
@@ -623,16 +1302,268 @@
}
}
+#if CONFIG_SUPERTX
+void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate,
+ int64_t *distortion, int *skippable,
+ int64_t *sse, int64_t ref_best_rd, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ int use_fast_coef_casting) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ struct rdcost_block_args args;
+ TX_TYPE tx_type;
+
+ av1_zero(args);
+ args.cpi = cpi;
+ args.x = x;
+ args.best_rd = ref_best_rd;
+ args.use_fast_coef_costing = use_fast_coef_casting;
+
+#if CONFIG_EXT_TX
+ assert(tx_size < TX_SIZES);
+#endif // CONFIG_EXT_TX
+
+ if (plane == 0) xd->mi[0]->mbmi.tx_size = tx_size;
+
+ av1_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
+
+ tx_type = get_tx_type(pd->plane_type, xd, 0, tx_size);
+ args.scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
+
+ block_rd_txfm(plane, 0, 0, 0, get_plane_block_size(bsize, pd), tx_size,
+ &args);
+
+ if (args.exit_early) {
+ *rate = INT_MAX;
+ *distortion = INT64_MAX;
+ *sse = INT64_MAX;
+ *skippable = 0;
+ } else {
+ *distortion = args.this_dist;
+ *rate = args.this_rate;
+ *sse = args.this_sse;
+ *skippable = !x->plane[plane].eobs[0];
+ }
+}
+#endif // CONFIG_SUPERTX
+
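+// RD cost of coding the luma plane with the given transform type and size,
+// including the transform type/size signalling and skip costs.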
+static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, int *r,
+ int64_t *d, int *s, int64_t *sse, int64_t ref_best_rd,
+ BLOCK_SIZE bs, TX_TYPE tx_type, int tx_size) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int64_t rd = INT64_MAX;
+ aom_prob skip_prob = av1_get_skip_prob(cm, xd);
+ int s0, s1;
+ const int is_inter = is_inter_block(mbmi);
+ const int tx_size_ctx = get_tx_size_context(xd);
+ const int tx_size_cat =
+ is_inter ? inter_tx_size_cat_lookup[bs] : intra_tx_size_cat_lookup[bs];
+ const TX_SIZE coded_tx_size = txsize_sqr_up_map[tx_size];
+ const int depth = tx_size_to_depth(coded_tx_size);
+ const int tx_select = cm->tx_mode == TX_MODE_SELECT;
+ const int r_tx_size = cpi->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
+
+ assert(skip_prob > 0);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs)));
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ s0 = av1_cost_bit(skip_prob, 0);
+ s1 = av1_cost_bit(skip_prob, 1);
+
+ mbmi->tx_type = tx_type;
+ mbmi->tx_size = tx_size;
+ txfm_rd_in_plane(x, cpi, r, d, s, sse, ref_best_rd, 0, bs, tx_size,
+ cpi->sf.use_fast_coef_costing);
+ if (*r == INT_MAX) return INT64_MAX;
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(tx_size, bs, is_inter) > 1 &&
+ !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+ const int ext_tx_set = get_ext_tx_set(tx_size, bs, is_inter);
+ if (is_inter) {
+ if (ext_tx_set > 0)
+ *r +=
+ cpi->inter_tx_type_costs[ext_tx_set][txsize_sqr_map[mbmi->tx_size]]
+ [mbmi->tx_type];
+ } else {
+ if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
+ *r += cpi->intra_tx_type_costs[ext_tx_set][mbmi->tx_size][mbmi->mode]
+ [mbmi->tx_type];
+ }
+ }
+#else
+ if (tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+ !FIXED_TX_TYPE) {
+ if (is_inter) {
+ *r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
+ } else {
+ *r += cpi->intra_tx_type_costs[mbmi->tx_size]
+ [intra_mode_to_tx_type_context[mbmi->mode]]
+ [mbmi->tx_type];
+ }
+ }
+#endif // CONFIG_EXT_TX
+
+ if (*s) {
+ if (is_inter) {
+ rd = RDCOST(x->rdmult, x->rddiv, s1, *sse);
+ } else {
+ rd = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size * tx_select, *sse);
+ }
+ } else {
+ rd = RDCOST(x->rdmult, x->rddiv, *r + s0 + r_tx_size * tx_select, *d);
+ }
+
+ if (tx_select) *r += r_tx_size;
+
+ if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && !(*s))
+ rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, *sse));
+
+ return rd;
+}
+
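+// For a fixed transform type, searches over the allowed transform sizes and
+// returns the best RD cost found.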
+static int64_t choose_tx_size_fix_type(const AV1_COMP *const cpi, BLOCK_SIZE bs,
+ MACROBLOCK *x, int *rate,
+ int64_t *distortion, int *skip,
+ int64_t *psse, int64_t ref_best_rd,
+ TX_TYPE tx_type, int prune) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int r, s;
+ int64_t d, sse;
+ int64_t rd = INT64_MAX;
+ int n;
+ int start_tx, end_tx;
+ int64_t best_rd = INT64_MAX, last_rd = INT64_MAX;
+ const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+ TX_SIZE best_tx_size = max_tx_size;
+ const int tx_select = cm->tx_mode == TX_MODE_SELECT;
+ const int is_inter = is_inter_block(mbmi);
+#if CONFIG_EXT_TX
+#if CONFIG_RECT_TX
+ int evaluate_rect_tx = 0;
+#endif // CONFIG_RECT_TX
+ int ext_tx_set;
+#endif // CONFIG_EXT_TX
+
+ if (tx_select) {
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ evaluate_rect_tx = is_rect_tx_allowed(xd, mbmi);
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ start_tx = max_tx_size;
+ end_tx = (max_tx_size == TX_32X32) ? TX_8X8 : TX_4X4;
+ } else {
+ const TX_SIZE chosen_tx_size =
+ tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ evaluate_rect_tx = is_rect_tx(chosen_tx_size);
+ assert(IMPLIES(evaluate_rect_tx, is_rect_tx_allowed(xd, mbmi)));
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ start_tx = chosen_tx_size;
+ end_tx = chosen_tx_size;
+ }
+
+ *distortion = INT64_MAX;
+ *rate = INT_MAX;
+ *skip = 0;
+ *psse = INT64_MAX;
+
+ mbmi->tx_type = tx_type;
+
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (evaluate_rect_tx) {
+ const TX_SIZE rect_tx_size = max_txsize_rect_lookup[bs];
+ ext_tx_set = get_ext_tx_set(rect_tx_size, bs, 1);
+ if (ext_tx_used_inter[ext_tx_set][tx_type]) {
+ rd = txfm_yrd(cpi, x, &r, &d, &s, &sse, ref_best_rd, bs, tx_type,
+ rect_tx_size);
+ best_tx_size = rect_tx_size;
+ best_rd = rd;
+ *distortion = d;
+ *rate = r;
+ *skip = s;
+ *psse = sse;
+ }
+ }
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ last_rd = INT64_MAX;
+ for (n = start_tx; n >= end_tx; --n) {
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (is_rect_tx(n)) break;
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, n)) continue;
+ if (!is_inter && x->use_default_intra_tx_type &&
+ tx_type != get_default_tx_type(0, xd, 0, n))
+ continue;
+ if (is_inter && x->use_default_inter_tx_type &&
+ tx_type != get_default_tx_type(0, xd, 0, n))
+ continue;
+ if (max_tx_size == TX_32X32 && n == TX_4X4) continue;
+#if CONFIG_EXT_TX
+ ext_tx_set = get_ext_tx_set(n, bs, is_inter);
+ if (is_inter) {
+ if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
+ if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
+ if (!do_tx_type_search(tx_type, prune)) continue;
+ }
+ } else {
+ if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
+ if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue;
+ }
+ if (!ext_tx_used_intra[ext_tx_set][tx_type]) continue;
+ }
+#else // CONFIG_EXT_TX
+ if (n >= TX_32X32 && tx_type != DCT_DCT) continue;
+ if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
+ !do_tx_type_search(tx_type, prune))
+ continue;
+#endif // CONFIG_EXT_TX
+
+ rd = txfm_yrd(cpi, x, &r, &d, &s, &sse, ref_best_rd, bs, tx_type, n);
+
+ // Early termination in transform size search.
+ if (cpi->sf.tx_size_search_breakout &&
+ (rd == INT64_MAX || (s == 1 && tx_type != DCT_DCT && n < start_tx) ||
+ (n < (int)max_tx_size && rd > last_rd)))
+ break;
+
+ last_rd = rd;
+ if (rd < best_rd) {
+ best_tx_size = n;
+ best_rd = rd;
+ *distortion = d;
+ *rate = r;
+ *skip = s;
+ *psse = sse;
+ }
+ }
+ mbmi->tx_size = best_tx_size;
+
+ return best_rd;
+}
+
+#if CONFIG_EXT_INTER
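+// Fast luma RD estimate using DCT_DCT at the largest transform size for the
+// block.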
+static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
+ MACROBLOCK *x, int *r, int64_t *d, int *s,
+ int64_t *sse, int64_t ref_best_rd) {
+ return txfm_yrd(cpi, x, r, d, s, sse, ref_best_rd, bs, DCT_DCT,
+ max_txsize_lookup[bs]);
+}
+#endif // CONFIG_EXT_INTER
+
static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
int *rate, int64_t *distortion, int *skip,
int64_t *sse, int64_t ref_best_rd,
BLOCK_SIZE bs) {
- const TX_SIZE max_tx_size = max_txsize_lookup[bs];
const AV1_COMMON *const cm = &cpi->common;
- const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-
TX_TYPE tx_type, best_tx_type = DCT_DCT;
int r, s;
int64_t d, psse, this_rd, best_rd = INT64_MAX;
@@ -640,15 +1571,32 @@
int s0 = av1_cost_bit(skip_prob, 0);
int s1 = av1_cost_bit(skip_prob, 1);
const int is_inter = is_inter_block(mbmi);
-
+ int prune = 0;
+#if CONFIG_EXT_TX
+ int ext_tx_set;
+#endif // CONFIG_EXT_TX
*distortion = INT64_MAX;
*rate = INT_MAX;
*skip = 0;
*sse = INT64_MAX;
- mbmi->tx_size = AOMMIN(max_tx_size, largest_tx_size);
+ mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode, is_inter);
+#if CONFIG_VAR_TX
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+#endif
+#if CONFIG_EXT_TX
+ ext_tx_set = get_ext_tx_set(mbmi->tx_size, bs, is_inter);
+#endif // CONFIG_EXT_TX
- if (mbmi->tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id]) {
+ if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
+#if CONFIG_EXT_TX
+ prune = prune_tx_types(cpi, bs, x, xd, ext_tx_set);
+#else
+ prune = prune_tx_types(cpi, bs, x, xd, 0);
+#endif
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(mbmi->tx_size, bs, is_inter) > 1 &&
+ !xd->lossless[mbmi->segment_id]) {
#if CONFIG_PVQ
od_rollback_buffer pre_buf, post_buf;
@@ -657,27 +1605,52 @@
#endif
for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+ if (is_inter) {
+ if (x->use_default_inter_tx_type &&
+ tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+ continue;
+ if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
+ if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
+ if (!do_tx_type_search(tx_type, prune)) continue;
+ }
+ } else {
+ if (x->use_default_intra_tx_type &&
+ tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+ continue;
+ if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
+ if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue;
+ }
+ if (!ext_tx_used_intra[ext_tx_set][tx_type]) continue;
+ }
+
mbmi->tx_type = tx_type;
- txfm_rd_in_plane(cm, x, &r, &d, &s, &psse, ref_best_rd, 0, bs,
+
+ txfm_rd_in_plane(x, cpi, &r, &d, &s, &psse, ref_best_rd, 0, bs,
mbmi->tx_size, cpi->sf.use_fast_coef_costing);
#if CONFIG_PVQ
od_encode_rollback(&x->daala_enc, &pre_buf);
#endif
if (r == INT_MAX) continue;
- if (is_inter)
- r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
- else
- r += cpi->intra_tx_type_costs[mbmi->tx_size]
- [intra_mode_to_tx_type_context[mbmi->mode]]
- [mbmi->tx_type];
+ if (get_ext_tx_types(mbmi->tx_size, bs, is_inter) > 1) {
+ if (is_inter) {
+ if (ext_tx_set > 0)
+ r += cpi->inter_tx_type_costs[ext_tx_set][mbmi->tx_size]
+ [mbmi->tx_type];
+ } else {
+ if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
+ r += cpi->intra_tx_type_costs[ext_tx_set][mbmi->tx_size][mbmi->mode]
+ [mbmi->tx_type];
+ }
+ }
+
if (s)
this_rd = RDCOST(x->rdmult, x->rddiv, s1, psse);
else
this_rd = RDCOST(x->rdmult, x->rddiv, r + s0, d);
- if (is_inter && !xd->lossless[mbmi->segment_id] && !s)
+ if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] && !s)
this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, psse));
- if (this_rd < ((best_tx_type == DCT_DCT) ? ext_tx_th : 1) * best_rd) {
+ if (this_rd < best_rd) {
best_rd = this_rd;
best_tx_type = mbmi->tx_type;
*distortion = d;
@@ -693,9 +1666,55 @@
od_encode_rollback(&x->daala_enc, &post_buf);
#endif
} else {
- txfm_rd_in_plane(cm, x, rate, distortion, skip, sse, ref_best_rd, 0, bs,
+ mbmi->tx_type = DCT_DCT;
+ txfm_rd_in_plane(x, cpi, rate, distortion, skip, sse, ref_best_rd, 0, bs,
mbmi->tx_size, cpi->sf.use_fast_coef_costing);
}
+#else // CONFIG_EXT_TX
+ if (mbmi->tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id]) {
+ for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+ if (!is_inter && x->use_default_intra_tx_type &&
+ tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+ continue;
+ if (is_inter && x->use_default_inter_tx_type &&
+ tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+ continue;
+ mbmi->tx_type = tx_type;
+ txfm_rd_in_plane(x, cpi, &r, &d, &s, &psse, ref_best_rd, 0, bs,
+ mbmi->tx_size, cpi->sf.use_fast_coef_costing);
+ if (r == INT_MAX) continue;
+ if (is_inter) {
+ r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
+ if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
+ !do_tx_type_search(tx_type, prune))
+ continue;
+ } else {
+ r += cpi->intra_tx_type_costs[mbmi->tx_size]
+ [intra_mode_to_tx_type_context[mbmi->mode]]
+ [mbmi->tx_type];
+ }
+ if (s)
+ this_rd = RDCOST(x->rdmult, x->rddiv, s1, psse);
+ else
+ this_rd = RDCOST(x->rdmult, x->rddiv, r + s0, d);
+ if (is_inter && !xd->lossless[mbmi->segment_id] && !s)
+ this_rd = AOMMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, psse));
+
+ if (this_rd < best_rd) {
+ best_rd = this_rd;
+ best_tx_type = mbmi->tx_type;
+ *distortion = d;
+ *rate = r;
+ *skip = s;
+ *sse = psse;
+ }
+ }
+ } else {
+ mbmi->tx_type = DCT_DCT;
+ txfm_rd_in_plane(x, cpi, rate, distortion, skip, sse, ref_best_rd, 0, bs,
+ mbmi->tx_size, cpi->sf.use_fast_coef_costing);
+ }
+#endif // CONFIG_EXT_TX
mbmi->tx_type = best_tx_type;
}
@@ -705,52 +1724,40 @@
BLOCK_SIZE bs) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
- const AV1_COMMON *const cm = &cpi->common;
mbmi->tx_size = TX_4X4;
mbmi->tx_type = DCT_DCT;
- txfm_rd_in_plane(cm, x, rate, distortion, skip, sse, ref_best_rd, 0, bs,
+#if CONFIG_VAR_TX
+ mbmi->min_tx_size = get_min_tx_size(TX_4X4);
+#endif
+
+ txfm_rd_in_plane(x, cpi, rate, distortion, skip, sse, ref_best_rd, 0, bs,
mbmi->tx_size, cpi->sf.use_fast_coef_costing);
}
-static void choose_tx_size_from_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
- int *rate, int64_t *distortion, int *skip,
- int64_t *psse, int64_t ref_best_rd,
- BLOCK_SIZE bs) {
- const TX_SIZE max_tx_size = max_txsize_lookup[bs];
- const AV1_COMMON *const cm = &cpi->common;
+static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
+ MACROBLOCK *x, int *rate,
+ int64_t *distortion, int *skip,
+ int64_t *psse, int64_t ref_best_rd,
+ BLOCK_SIZE bs) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
- aom_prob skip_prob = av1_get_skip_prob(cm, xd);
int r, s;
int64_t d, sse;
int64_t rd = INT64_MAX;
- int n, m;
- int s0, s1;
- int64_t best_rd = INT64_MAX, last_rd = INT64_MAX;
- TX_SIZE best_tx = TX_SIZES;
- int start_tx, end_tx;
- const int tx_select = cm->tx_mode == TX_MODE_SELECT;
- TX_TYPE tx_type, best_tx_type = DCT_DCT;
+ int64_t best_rd = INT64_MAX;
+ TX_SIZE best_tx = max_txsize_lookup[bs];
const int is_inter = is_inter_block(mbmi);
- const aom_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
+ TX_TYPE tx_type, best_tx_type = DCT_DCT;
+ int prune = 0;
#if CONFIG_PVQ
od_rollback_buffer buf;
#endif
- assert(skip_prob > 0);
- s0 = av1_cost_bit(skip_prob, 0);
- s1 = av1_cost_bit(skip_prob, 1);
-
- if (tx_select) {
- start_tx = max_tx_size;
- end_tx = (max_tx_size == TX_32X32) ? TX_8X8 : TX_4X4;
- } else {
- const TX_SIZE chosen_tx_size =
- AOMMIN(max_tx_size, tx_mode_to_biggest_tx_size[cm->tx_mode]);
- start_tx = chosen_tx_size;
- end_tx = chosen_tx_size;
- }
+ if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
+ // Passing -1 for tx_type indicates that all 1D transforms should be
+ // considered for pruning.
+ prune = prune_tx_types(cpi, bs, x, xd, -1);
*distortion = INT64_MAX;
*rate = INT_MAX;
@@ -765,80 +1772,32 @@
#if CONFIG_REF_MV
if (mbmi->ref_mv_idx > 0 && tx_type != DCT_DCT) continue;
#endif
-
- last_rd = INT64_MAX;
- for (n = start_tx; n >= end_tx; --n) {
- int r_tx_size = 0;
- for (m = TX_4X4; m <= n - (n == (int)max_tx_size); ++m) {
- if (m == n)
- r_tx_size += av1_cost_zero(tx_probs[m]);
- else
- r_tx_size += av1_cost_one(tx_probs[m]);
- }
-
- if (n >= TX_32X32 && tx_type != DCT_DCT) {
- continue;
- }
- mbmi->tx_type = tx_type;
- txfm_rd_in_plane(cm, x, &r, &d, &s, &sse, ref_best_rd, 0, bs, n,
- cpi->sf.use_fast_coef_costing);
-#if CONFIG_PVQ
- od_encode_rollback(&x->daala_enc, &buf);
-#endif
- if (n < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
- r != INT_MAX) {
- if (is_inter)
- r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
- else
- r += cpi->intra_tx_type_costs
- [mbmi->tx_size][intra_mode_to_tx_type_context[mbmi->mode]]
- [mbmi->tx_type];
- }
-
- if (r == INT_MAX) continue;
-
- if (s) {
- if (is_inter) {
- rd = RDCOST(x->rdmult, x->rddiv, s1, sse);
- } else {
- rd = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size * tx_select, sse);
- }
- } else {
- rd = RDCOST(x->rdmult, x->rddiv, r + s0 + r_tx_size * tx_select, d);
- }
-
- if (tx_select && !(s && is_inter)) r += r_tx_size;
-
- if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && !s)
- rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, sse));
-
- // Early termination in transform size search.
- if (cpi->sf.tx_size_search_breakout &&
- (rd == INT64_MAX || (s == 1 && tx_type != DCT_DCT && n < start_tx) ||
- (n < (int)max_tx_size && rd > last_rd)))
- break;
-
- last_rd = rd;
- if (rd <
- (is_inter && best_tx_type == DCT_DCT ? ext_tx_th : 1) * best_rd) {
- best_tx = n;
- best_rd = rd;
- *distortion = d;
- *rate = r;
- *skip = s;
- *psse = sse;
- best_tx_type = mbmi->tx_type;
- }
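+ // For this transform type, search over transform sizes and keep the best
+ // (type, size) pair by overall RD cost.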
+ rd = choose_tx_size_fix_type(cpi, bs, x, &r, &d, &s, &sse, ref_best_rd,
+ tx_type, prune);
+ if (rd < best_rd) {
+ best_rd = rd;
+ *distortion = d;
+ *rate = r;
+ *skip = s;
+ *psse = sse;
+ best_tx_type = tx_type;
+ best_tx = mbmi->tx_size;
}
}
mbmi->tx_size = best_tx;
mbmi->tx_type = best_tx_type;
+#if CONFIG_VAR_TX
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+#endif
+
+#if !CONFIG_EXT_TX
if (mbmi->tx_size >= TX_32X32) assert(mbmi->tx_type == DCT_DCT);
+#endif
#if CONFIG_PVQ
if (best_tx < TX_SIZES)
- txfm_rd_in_plane(cm, x, &r, &d, &s, &sse, ref_best_rd, 0, bs, best_tx,
+ txfm_rd_in_plane(x, cpi, &r, &d, &s, &sse, ref_best_rd, 0, bs, best_tx,
cpi->sf.use_fast_coef_costing);
#endif
}
@@ -852,16 +1811,15 @@
assert(bs == xd->mi[0]->mbmi.sb_type);
- if (xd->lossless[0]) {
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
choose_smallest_tx_size(cpi, x, rate, distortion, skip, ret_sse,
ref_best_rd, bs);
- } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||
- xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+ } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd,
bs);
} else {
- choose_tx_size_from_rd(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd,
- bs);
+ choose_tx_size_type_from_rd(cpi, x, rate, distortion, skip, ret_sse,
+ ref_best_rd, bs);
}
}
@@ -882,15 +1840,6 @@
return 0;
}
-#if CONFIG_EXT_INTRA || CONFIG_PALETTE
-static INLINE int write_uniform_cost(int n, int v) {
- const int l = get_unsigned_bits(n), m = (1 << l) - n;
- if (l == 0) return 0;
- return (v < m) ? ((l - 1) * av1_cost_bit(128, 0))
- : (l * av1_cost_bit(128, 0));
-}
-#endif // CONFIG_EXT_INTRA || CONFIG_PALETTE
-
#if CONFIG_PALETTE
static int rd_pick_palette_intra_sby(
const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int palette_ctx,
@@ -917,6 +1866,9 @@
#endif // CONFIG_AOM_HIGHBITDEPTH
colors = av1_count_colors(src, src_stride, rows, cols);
palette_mode_info->palette_size[0] = 0;
+#if CONFIG_FILTER_INTRA
+ mic->mbmi.filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+#endif // CONFIG_FILTER_INTRA
if (colors > 1 && colors <= 64) {
int r, c, i, j, k;
@@ -965,6 +1917,9 @@
#endif // CONFIG_AOM_HIGHBITDEPTH
mbmi->mode = DC_PRED;
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+#endif // CONFIG_FILTER_INTRA
if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return 0;
@@ -1027,13 +1982,11 @@
}
#endif // CONFIG_PALETTE
-static int64_t rd_pick_intra4x4block(const AV1_COMP *const cpi, MACROBLOCK *x,
- int row, int col,
- PREDICTION_MODE *best_mode,
- const int *bmode_costs, ENTROPY_CONTEXT *a,
- ENTROPY_CONTEXT *l, int *bestrate,
- int *bestratey, int64_t *bestdistortion,
- BLOCK_SIZE bsize, int64_t rd_thresh) {
+static int64_t rd_pick_intra4x4block(
+ const AV1_COMP *const cpi, MACROBLOCK *x, int row, int col,
+ PREDICTION_MODE *best_mode, const int *bmode_costs, ENTROPY_CONTEXT *a,
+ ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, int64_t *bestdistortion,
+ BLOCK_SIZE bsize, int *y_skip, int64_t rd_thresh) {
#if !CONFIG_PVQ
const AV1_COMMON *const cm = &cpi->common;
#endif
@@ -1051,6 +2004,7 @@
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
int idx, idy;
+ int best_can_skip = 0;
uint8_t best_dst[8 * 8];
#if CONFIG_AOM_HIGHBITDEPTH
uint16_t best_dst16[8 * 8];
@@ -1076,6 +2030,7 @@
int ratey = 0;
int64_t distortion = 0;
int rate = bmode_costs[mode];
+ int can_skip = 1;
if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode))) continue;
@@ -1095,47 +2050,64 @@
uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
int16_t *const src_diff =
av1_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
- tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
xd->mi[0]->bmi[block].as_mode = mode;
- av1_predict_intra_block(xd, 1, 1, TX_4X4, mode, dst, dst_stride, dst,
- dst_stride, col + idx, row + idy, 0);
+ av1_predict_intra_block(xd, pd->width, pd->height, TX_4X4, mode, dst,
+ dst_stride, dst, dst_stride, col + idx,
+ row + idy, 0);
aom_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride, dst,
dst_stride, xd->bd);
if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
- TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
- const SCAN_ORDER *scan_order = get_scan(cm, TX_4X4, tx_type);
- av1_highbd_fwd_txfm_4x4(src_diff, coeff, 8, DCT_DCT, 1);
- av1_regular_quantize_b_4x4(x, 0, block, scan_order->scan,
- scan_order->iscan);
- ratey +=
- cost_coeffs(cm, x, 0, block, tempa + idx, templ + idy, TX_4X4,
- scan_order->scan, scan_order->neighbors,
- cpi->sf.use_fast_coef_costing);
+ TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, TX_4X4);
+ const SCAN_ORDER *scan_order = get_scan(cm, TX_4X4, tx_type, 0);
+ const int coeff_ctx =
+ combine_entropy_contexts(*(tempa + idx), *(templ + idy));
+#if CONFIG_NEW_QUANT
+ av1_xform_quant_fp_nuq(cm, x, 0, block, row + idy, col + idx,
+ BLOCK_8X8, TX_4X4, coeff_ctx);
+#else
+ av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
+ TX_4X4, AV1_XFORM_QUANT_FP);
+#endif // CONFIG_NEW_QUANT
+ ratey += av1_cost_coeffs(cm, x, 0, block, coeff_ctx, TX_4X4,
+ scan_order->scan, scan_order->neighbors,
+ cpi->sf.use_fast_coef_costing);
+ *(tempa + idx) = !(p->eobs[block] == 0);
+ *(templ + idy) = !(p->eobs[block] == 0);
+ can_skip &= (p->eobs[block] == 0);
if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
goto next_highbd;
av1_highbd_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block), dst,
dst_stride, p->eobs[block], xd->bd,
DCT_DCT, 1);
} else {
- int64_t unused;
- TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
- const SCAN_ORDER *scan_order = get_scan(cm, TX_4X4, tx_type);
- av1_highbd_fwd_txfm_4x4(src_diff, coeff, 8, tx_type, 0);
- av1_regular_quantize_b_4x4(x, 0, block, scan_order->scan,
- scan_order->iscan);
- ratey +=
- cost_coeffs(cm, x, 0, block, tempa + idx, templ + idy, TX_4X4,
- scan_order->scan, scan_order->neighbors,
- cpi->sf.use_fast_coef_costing);
- distortion +=
- av1_highbd_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
- 16, &unused, xd->bd) >>
- 2;
- if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
- goto next_highbd;
+ int64_t dist;
+ unsigned int tmp;
+ TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, TX_4X4);
+ const SCAN_ORDER *scan_order = get_scan(cm, TX_4X4, tx_type, 0);
+ const int coeff_ctx =
+ combine_entropy_contexts(*(tempa + idx), *(templ + idy));
+#if CONFIG_NEW_QUANT
+ av1_xform_quant_fp_nuq(cm, x, 0, block, row + idy, col + idx,
+ BLOCK_8X8, TX_4X4, coeff_ctx);
+#else
+ av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
+ TX_4X4, AV1_XFORM_QUANT_FP);
+#endif // CONFIG_NEW_QUANT
+ av1_optimize_b(cm, x, 0, block, TX_4X4, coeff_ctx);
+ ratey += av1_cost_coeffs(cm, x, 0, block, coeff_ctx, TX_4X4,
+ scan_order->scan, scan_order->neighbors,
+ cpi->sf.use_fast_coef_costing);
+ *(tempa + idx) = !(p->eobs[block] == 0);
+ *(templ + idy) = !(p->eobs[block] == 0);
+ can_skip &= (p->eobs[block] == 0);
av1_highbd_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block), dst,
dst_stride, p->eobs[block], xd->bd,
tx_type, 0);
+ cpi->fn_ptr[BLOCK_4X4].vf(src, src_stride, dst, dst_stride, &tmp);
+ dist = (int64_t)tmp << 4;
+ distortion += dist;
+ if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+ goto next_highbd;
}
}
}
@@ -1148,6 +2120,7 @@
*bestratey = ratey;
*bestdistortion = distortion;
best_rd = this_rd;
+ best_can_skip = can_skip;
*best_mode = mode;
memcpy(a, tempa, num_4x4_blocks_wide * sizeof(tempa[0]));
memcpy(l, templ, num_4x4_blocks_high * sizeof(templ[0]));
@@ -1159,8 +2132,11 @@
}
next_highbd : {}
}
+
if (best_rd >= rd_thresh) return best_rd;
+ if (y_skip) *y_skip &= best_can_skip;
+
for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
best_dst16 + idy * 8, num_4x4_blocks_wide * 4 * sizeof(uint16_t));
@@ -1179,6 +2155,7 @@
int ratey = 0;
int64_t distortion = 0;
int rate = bmode_costs[mode];
+ int can_skip = 1;
if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode))) continue;
@@ -1196,25 +2173,26 @@
const int block = (row + idy) * 2 + (col + idx);
const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
- tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
#if !CONFIG_PVQ
int16_t *const src_diff =
av1_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
#else
int lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
const int diff_stride = 8;
- tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[0].dqcoeff, block);
tran_low_t *ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, block);
int16_t *pred = &pd->pred[4 * (row * diff_stride + col)];
int16_t *src_int16 = &p->src_int16[4 * (row * diff_stride + col)];
int i, j, tx_blk_size;
- TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
+ TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, TX_4X4);
int rate_pvq;
int skip;
#endif
xd->mi[0]->bmi[block].as_mode = mode;
- av1_predict_intra_block(xd, 1, 1, TX_4X4, mode, dst, dst_stride, dst,
- dst_stride, col + idx, row + idy, 0);
+ av1_predict_intra_block(xd, pd->width, pd->height, TX_4X4, mode, dst,
+ dst_stride, dst, dst_stride, col + idx,
+ row + idy, 0);
#if !CONFIG_PVQ
aom_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
#else
@@ -1229,20 +2207,37 @@
src_int16[diff_stride * j + i] = src[src_stride * j + i];
pred[diff_stride * j + i] = dst[dst_stride * j + i];
}
- av1_fwd_txfm_4x4(src_int16, coeff, diff_stride, tx_type, lossless);
- av1_fwd_txfm_4x4(pred, ref_coeff, diff_stride, tx_type, lossless);
+ {
+ FWD_TXFM_PARAM fwd_txfm_param;
+ fwd_txfm_param.tx_type = tx_type;
+ fwd_txfm_param.tx_size = TX_4X4;
+ fwd_txfm_param.fwd_txfm_opt = FWD_TXFM_OPT_NORMAL;
+ fwd_txfm_param.rd_transform = 0;
+ fwd_txfm_param.lossless = lossless;
+ fwd_txfm(src_int16, coeff, diff_stride, &fwd_txfm_param);
+ fwd_txfm(pred, ref_coeff, diff_stride, &fwd_txfm_param);
+ }
#endif
if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
#if !CONFIG_PVQ
- TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
- const SCAN_ORDER *scan_order = get_scan(cm, TX_4X4, tx_type);
- av1_fwd_txfm_4x4(src_diff, coeff, 8, DCT_DCT, 1);
- av1_regular_quantize_b_4x4(x, 0, block, scan_order->scan,
- scan_order->iscan);
- ratey += cost_coeffs(cm, x, 0, block, tempa + idx, templ + idy,
- TX_4X4, scan_order->scan, scan_order->neighbors,
- cpi->sf.use_fast_coef_costing);
+ TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, TX_4X4);
+ const SCAN_ORDER *scan_order = get_scan(cm, TX_4X4, tx_type, 0);
+ const int coeff_ctx =
+ combine_entropy_contexts(*(tempa + idx), *(templ + idy));
+#if CONFIG_NEW_QUANT
+ av1_xform_quant_fp_nuq(cm, x, 0, block, row + idy, col + idx,
+ BLOCK_8X8, TX_4X4, coeff_ctx);
+#else
+ av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
+ TX_4X4, AV1_XFORM_QUANT_B);
+#endif // CONFIG_NEW_QUANT
+ ratey += av1_cost_coeffs(cm, x, 0, block, coeff_ctx, TX_4X4,
+ scan_order->scan, scan_order->neighbors,
+ cpi->sf.use_fast_coef_costing);
+ *(tempa + idx) = !(p->eobs[block] == 0);
+ *(templ + idy) = !(p->eobs[block] == 0);
+ can_skip &= (p->eobs[block] == 0);
#else
skip = av1_pvq_encode_helper(&x->daala_enc, coeff, ref_coeff, dqcoeff,
&p->eobs[block], pd->dequant, 0, TX_4X4,
@@ -1262,16 +2257,24 @@
}
#endif
} else {
- int64_t unused;
+ int64_t dist;
+ unsigned int tmp;
#if !CONFIG_PVQ
- TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
- const SCAN_ORDER *scan_order = get_scan(cm, TX_4X4, tx_type);
- av1_fwd_txfm_4x4(src_diff, coeff, 8, tx_type, 0);
- av1_regular_quantize_b_4x4(x, 0, block, scan_order->scan,
- scan_order->iscan);
- ratey += cost_coeffs(cm, x, 0, block, tempa + idx, templ + idy,
- TX_4X4, scan_order->scan, scan_order->neighbors,
- cpi->sf.use_fast_coef_costing);
+ TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, TX_4X4);
+ const SCAN_ORDER *scan_order = get_scan(cm, TX_4X4, tx_type, 0);
+ const int coeff_ctx =
+ combine_entropy_contexts(*(tempa + idx), *(templ + idy));
+#if CONFIG_NEW_QUANT
+ av1_xform_quant_fp_nuq(cm, x, 0, block, row + idy, col + idx,
+ BLOCK_8X8, TX_4X4, coeff_ctx);
+#else
+ av1_xform_quant(cm, x, 0, block, row + idy, col + idx, BLOCK_8X8,
+ TX_4X4, AV1_XFORM_QUANT_FP);
+#endif // CONFIG_NEW_QUANT
+ av1_optimize_b(cm, x, 0, block, TX_4X4, coeff_ctx);
+ ratey += av1_cost_coeffs(cm, x, 0, block, coeff_ctx, TX_4X4,
+ scan_order->scan, scan_order->neighbors,
+ cpi->sf.use_fast_coef_costing);
#else
skip = av1_pvq_encode_helper(&x->daala_enc, coeff, ref_coeff, dqcoeff,
&p->eobs[block], pd->dequant, 0, TX_4X4,
@@ -1279,9 +2282,14 @@
ratey += rate_pvq;
#endif
// No need for av1_block_error2_c because the ssz is unused
- distortion += av1_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
- 16, &unused) >>
- 2;
+ av1_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block), dst,
+ dst_stride, p->eobs[block], tx_type, 0);
+ cpi->fn_ptr[BLOCK_4X4].vf(src, src_stride, dst, dst_stride, &tmp);
+ dist = (int64_t)tmp << 4;
+ distortion += dist;
+ // To use the pixel-domain distortion, the early-termination check below
+ // has to come after the inverse transform. Compared to computing the
+ // distortion in the frequency domain, the extra encoding overhead is small.
if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
goto next;
#if CONFIG_PVQ
@@ -1289,14 +2297,12 @@
for (j = 0; j < tx_blk_size; j++)
for (i = 0; i < tx_blk_size; i++) dst[j * dst_stride + i] = 0;
#endif
- av1_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block), dst,
- dst_stride, p->eobs[block], tx_type, 0);
#if CONFIG_PVQ
}
#endif
}
}
- } // idy loop
+ }
rate += ratey;
this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
@@ -1306,6 +2312,7 @@
*bestratey = ratey;
*bestdistortion = distortion;
best_rd = this_rd;
+ best_can_skip = can_skip;
*best_mode = mode;
memcpy(a, tempa, num_4x4_blocks_wide * sizeof(tempa[0]));
memcpy(l, templ, num_4x4_blocks_high * sizeof(templ[0]));
@@ -1328,6 +2335,8 @@
od_encode_rollback(&x->daala_enc, &post_buf);
#endif
+ if (y_skip) *y_skip &= best_can_skip;
+
for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
num_4x4_blocks_wide * 4);
@@ -1338,7 +2347,7 @@
static int64_t rd_pick_intra_sub_8x8_y_mode(const AV1_COMP *const cpi,
MACROBLOCK *mb, int *rate,
int *rate_y, int64_t *distortion,
- int64_t best_rd) {
+ int *y_skip, int64_t best_rd) {
int i, j;
const MACROBLOCKD *const xd = &mb->e_mbd;
MODE_INFO *const mic = xd->mi[0];
@@ -1352,11 +2361,21 @@
int64_t total_distortion = 0;
int tot_rate_y = 0;
int64_t total_rd = 0;
- const int *bmode_costs = cpi->mbmode_cost;
+ const int *bmode_costs = cpi->mbmode_cost[0];
#if CONFIG_EXT_INTRA
- mic->mbmi.intra_angle_delta[0] = 0;
+ mic->mbmi.intra_filter = INTRA_FILTER_LINEAR;
#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ mic->mbmi.filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+#endif // CONFIG_FILTER_INTRA
+
+ // TODO(any): Add search of the tx_type to improve rd performance at the
+ // expense of speed.
+ mic->mbmi.tx_type = DCT_DCT;
+ mic->mbmi.tx_size = TX_4X4;
+
+ if (y_skip) *y_skip = 1;
// Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
@@ -1375,7 +2394,7 @@
this_rd = rd_pick_intra4x4block(
cpi, mb, idy, idx, &best_mode, bmode_costs,
xd->plane[0].above_context + idx, xd->plane[0].left_context + idy, &r,
- &ry, &d, bsize, best_rd - total_rd);
+ &ry, &d, bsize, y_skip, best_rd - total_rd);
if (this_rd >= best_rd - total_rd) return INT64_MAX;
total_rd += this_rd;
@@ -1392,159 +2411,280 @@
if (total_rd >= best_rd) return INT64_MAX;
}
}
+ mic->mbmi.mode = mic->bmi[3].as_mode;
+
+ // Add in the cost of the transform type
+ if (!xd->lossless[mic->mbmi.segment_id]) {
+ int rate_tx_type = 0;
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(TX_4X4, bsize, 0) > 1) {
+ const int eset = get_ext_tx_set(TX_4X4, bsize, 0);
+ rate_tx_type = cpi->intra_tx_type_costs[eset][TX_4X4][mic->mbmi.mode]
+ [mic->mbmi.tx_type];
+ }
+#else
+ rate_tx_type =
+ cpi->intra_tx_type_costs[TX_4X4]
+ [intra_mode_to_tx_type_context[mic->mbmi.mode]]
+ [mic->mbmi.tx_type];
+#endif
+ assert(mic->mbmi.tx_size == TX_4X4);
+ cost += rate_tx_type;
+ tot_rate_y += rate_tx_type;
+ }
*rate = cost;
*rate_y = tot_rate_y;
*distortion = total_distortion;
- mic->mbmi.mode = mic->bmi[3].as_mode;
return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
}
-#if CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+// Return 1 if a filter intra mode is selected; return 0 otherwise.
+static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, int *skippable,
+ BLOCK_SIZE bsize, int mode_cost,
+ int64_t *best_rd, uint16_t skip_mask) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mic = xd->mi[0];
+ MB_MODE_INFO *mbmi = &mic->mbmi;
+ int this_rate, this_rate_tokenonly, s;
+ int filter_intra_selected_flag = 0;
+ int64_t this_distortion, this_rd;
+ FILTER_INTRA_MODE mode;
+ TX_SIZE best_tx_size = TX_4X4;
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+ TX_TYPE best_tx_type;
-static int64_t pick_intra_angle_routine_sby(
- const AV1_COMP *const cpi, MACROBLOCK *x, int8_t angle_delta,
- int max_angle_delta, int *rate, int *rate_tokenonly, int64_t *distortion,
- int *skippable, int8_t *best_angle_delta, TX_SIZE *best_tx_size,
- TX_TYPE *best_tx_type, BLOCK_SIZE bsize, int mode_cost, int64_t *best_rd,
- int64_t best_rd_in) {
+ av1_zero(filter_intra_mode_info);
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 1;
+ mbmi->mode = DC_PRED;
+#if CONFIG_PALETTE
+ mbmi->palette_mode_info.palette_size[0] = 0;
+#endif // CONFIG_PALETTE
+
+ for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+ if (skip_mask & (1 << mode)) continue;
+ mbmi->filter_intra_mode_info.filter_intra_mode[0] = mode;
+ super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
+ bsize, *best_rd);
+ if (this_rate_tokenonly == INT_MAX) continue;
+
+ this_rate = this_rate_tokenonly +
+ av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 1) +
+ write_uniform_cost(FILTER_INTRA_MODES, mode) + mode_cost;
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ best_tx_size = mic->mbmi.tx_size;
+ filter_intra_mode_info = mbmi->filter_intra_mode_info;
+ best_tx_type = mic->mbmi.tx_type;
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = this_distortion;
+ *skippable = s;
+ filter_intra_selected_flag = 1;
+ }
+ }
+
+ if (filter_intra_selected_flag) {
+ mbmi->mode = DC_PRED;
+ mbmi->tx_size = best_tx_size;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] =
+ filter_intra_mode_info.use_filter_intra_mode[0];
+ mbmi->filter_intra_mode_info.filter_intra_mode[0] =
+ filter_intra_mode_info.filter_intra_mode[0];
+ mbmi->tx_type = best_tx_type;
+ return 1;
+ } else {
+ return 0;
+ }
+}
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_EXT_INTRA
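+// Helper for the intra angle search: evaluate the current angle_delta and
+// intra filter setting, updating the best-so-far parameters when the RD
+// cost improves.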
+static void pick_intra_angle_routine_sby(
+ const AV1_COMP *const cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly,
+ int64_t *distortion, int *skippable, int *best_angle_delta,
+ TX_SIZE *best_tx_size, TX_TYPE *best_tx_type, INTRA_FILTER *best_filter,
+ BLOCK_SIZE bsize, int rate_overhead, int64_t *best_rd) {
int this_rate, this_rate_tokenonly, s;
int64_t this_distortion, this_rd;
- MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
-
- mbmi->intra_angle_delta[0] = angle_delta;
+ MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
- bsize, best_rd_in);
- if (this_rate_tokenonly == INT_MAX) return INT64_MAX;
+ bsize, *best_rd);
+ if (this_rate_tokenonly == INT_MAX) return;
- this_rate = this_rate_tokenonly + mode_cost +
- write_uniform_cost(2 * max_angle_delta + 1,
- mbmi->intra_angle_delta[0] + max_angle_delta);
+ this_rate = this_rate_tokenonly + rate_overhead;
this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
if (this_rd < *best_rd) {
*best_rd = this_rd;
- *best_angle_delta = mbmi->intra_angle_delta[0];
+ *best_angle_delta = mbmi->angle_delta[0];
*best_tx_size = mbmi->tx_size;
+ *best_filter = mbmi->intra_filter;
*best_tx_type = mbmi->tx_type;
*rate = this_rate;
*rate_tokenonly = this_rate_tokenonly;
*distortion = this_distortion;
*skippable = s;
}
- return this_rd;
}
static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
int64_t *distortion, int *skippable,
- BLOCK_SIZE bsize, int mode_cost,
+ BLOCK_SIZE bsize, int rate_overhead,
int64_t best_rd) {
- MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
- const int max_angle_delta =
- av1_max_angle_delta_y[max_txsize_lookup[bsize]][mbmi->mode];
- int i;
- int8_t angle_delta, best_angle_delta = 0;
- int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
- TX_SIZE best_tx_size = mbmi->tx_size;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MODE_INFO *const mic = xd->mi[0];
+ MB_MODE_INFO *mbmi = &mic->mbmi;
+ int this_rate, this_rate_tokenonly, s;
+ int angle_delta, best_angle_delta = 0, p_angle;
+ const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
+ INTRA_FILTER filter, best_filter = INTRA_FILTER_LINEAR;
+ const double rd_adjust = 1.2;
+ int64_t this_distortion, this_rd;
+ TX_SIZE best_tx_size = mic->mbmi.tx_size;
TX_TYPE best_tx_type = mbmi->tx_type;
- for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
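+ // Two-stage fast angle search: first evaluate angle deltas {0, -2, +2},
+ // then refine around the best of those using its neighbouring odd deltas.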
+ if (ANGLE_FAST_SEARCH) {
+ int deltas_level1[3] = { 0, -2, 2 };
+ int deltas_level2[3][2] = {
+ { -1, 1 }, { -3, -1 }, { 1, 3 },
+ };
+ const int level1 = 3, level2 = 2;
+ int i, j, best_i = -1;
- for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
- if (angle_delta > max_angle_delta) continue;
- for (i = 0; i < 2; ++i) {
- best_rd_in = (best_rd == INT64_MAX)
- ? INT64_MAX
- : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5)));
- this_rd = pick_intra_angle_routine_sby(
- cpi, x, (1 - 2 * i) * angle_delta, max_angle_delta, rate,
- rate_tokenonly, distortion, skippable, &best_angle_delta,
- &best_tx_size, &best_tx_type, bsize, mode_cost, &best_rd, best_rd_in);
- rd_cost[2 * angle_delta + i] = this_rd;
- if (angle_delta == 0) {
- if (this_rd == INT64_MAX) return best_rd;
- rd_cost[1] = this_rd;
- break;
+ for (i = 0; i < level1; ++i) {
+ mic->mbmi.angle_delta[0] = deltas_level1[i];
+ p_angle =
+ mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+ for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
+ int64_t tmp_best_rd;
+ if ((FILTER_FAST_SEARCH || !av1_is_intra_filter_switchable(p_angle)) &&
+ filter != INTRA_FILTER_LINEAR)
+ continue;
+ mic->mbmi.intra_filter = filter;
+ tmp_best_rd =
+ (i == 0 && filter == INTRA_FILTER_LINEAR && best_rd < INT64_MAX)
+ ? (int64_t)(best_rd * rd_adjust)
+ : best_rd;
+ super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
+ NULL, bsize, tmp_best_rd);
+ if (this_rate_tokenonly == INT_MAX) {
+ if (i == 0 && filter == INTRA_FILTER_LINEAR)
+ return best_rd;
+ else
+ continue;
+ }
+ this_rate = this_rate_tokenonly + rate_overhead +
+ cpi->intra_filter_cost[intra_filter_ctx][filter];
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+ if (i == 0 && filter == INTRA_FILTER_LINEAR && best_rd < INT64_MAX &&
+ this_rd > best_rd * rd_adjust)
+ return best_rd;
+ if (this_rd < best_rd) {
+ best_i = i;
+ best_rd = this_rd;
+ best_angle_delta = mbmi->angle_delta[0];
+ best_tx_size = mbmi->tx_size;
+ best_filter = mbmi->intra_filter;
+ best_tx_type = mbmi->tx_type;
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = this_distortion;
+ *skippable = s;
+ }
+ }
+ }
+
+ if (best_i >= 0) {
+ for (j = 0; j < level2; ++j) {
+ mic->mbmi.angle_delta[0] = deltas_level2[best_i][j];
+ p_angle =
+ mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+ for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
+ mic->mbmi.intra_filter = filter;
+ if ((FILTER_FAST_SEARCH ||
+ !av1_is_intra_filter_switchable(p_angle)) &&
+ filter != INTRA_FILTER_LINEAR)
+ continue;
+ pick_intra_angle_routine_sby(
+ cpi, x, rate, rate_tokenonly, distortion, skippable,
+ &best_angle_delta, &best_tx_size, &best_tx_type, &best_filter,
+ bsize,
+ rate_overhead + cpi->intra_filter_cost[intra_filter_ctx][filter],
+ &best_rd);
+ }
+ }
+ }
+ } else {
+ for (angle_delta = -MAX_ANGLE_DELTAS; angle_delta <= MAX_ANGLE_DELTAS;
+ ++angle_delta) {
+ mbmi->angle_delta[0] = angle_delta;
+ p_angle =
+ mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+ for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
+ mic->mbmi.intra_filter = filter;
+ if ((FILTER_FAST_SEARCH || !av1_is_intra_filter_switchable(p_angle)) &&
+ filter != INTRA_FILTER_LINEAR)
+ continue;
+ pick_intra_angle_routine_sby(
+ cpi, x, rate, rate_tokenonly, distortion, skippable,
+ &best_angle_delta, &best_tx_size, &best_tx_type, &best_filter,
+ bsize,
+ rate_overhead + cpi->intra_filter_cost[intra_filter_ctx][filter],
+ &best_rd);
}
}
}
- assert(best_rd != INT64_MAX);
- for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
- int skip_search;
- int64_t rd_thresh;
- if (angle_delta > max_angle_delta) continue;
- for (i = 0; i < 2; ++i) {
- skip_search = 0;
- rd_thresh = best_rd + (best_rd >> 5);
- if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
- rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
- skip_search = 1;
- if (!skip_search) {
- this_rd = pick_intra_angle_routine_sby(
- cpi, x, (1 - 2 * i) * angle_delta, max_angle_delta, rate,
- rate_tokenonly, distortion, skippable, &best_angle_delta,
- &best_tx_size, &best_tx_type, bsize, mode_cost, &best_rd, best_rd);
+ if (FILTER_FAST_SEARCH && *rate_tokenonly < INT_MAX) {
+ mbmi->angle_delta[0] = best_angle_delta;
+ p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle)) {
+ for (filter = INTRA_FILTER_LINEAR + 1; filter < INTRA_FILTERS; ++filter) {
+ mic->mbmi.intra_filter = filter;
+ pick_intra_angle_routine_sby(
+ cpi, x, rate, rate_tokenonly, distortion, skippable,
+ &best_angle_delta, &best_tx_size, &best_tx_type, &best_filter,
+ bsize,
+ rate_overhead + cpi->intra_filter_cost[intra_filter_ctx][filter],
+ &best_rd);
}
}
}
mbmi->tx_size = best_tx_size;
- mbmi->intra_angle_delta[0] = best_angle_delta;
+ mbmi->angle_delta[0] = best_angle_delta;
+ mic->mbmi.intra_filter = best_filter;
mbmi->tx_type = best_tx_type;
-
return best_rd;
}
// Indices are sign, integer, and fractional part of the gradient value
static const uint8_t gradient_to_angle_bin[2][7][16] = {
{
- {
- 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0,
- },
- {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
- },
- {
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- },
- {
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- },
- {
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- },
- {
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- },
- {
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- },
+ { 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
},
{
- {
- 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4,
- },
- {
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3,
- },
- {
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- },
- {
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- },
- {
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
- },
- {
- 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- },
- {
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- },
+ { 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4 },
+ { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
},
};
@@ -1552,13 +2692,9 @@
0, 2, 6, 0, 4, 3, 5, 7, 1, 0,
};
-// Use gradient analysis to calculate angle histogram. Prediction modes
-// corresponding to angles of small percentage will be marked in the mask.
-static void angle_estimation(const uint8_t *src, const int src_stride,
- const int rows, const int cols,
- uint8_t *directional_mode_skip_mask) {
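+// Use gradient analysis to build an angle histogram; prediction modes whose
+// directions account for only a small share of the histogram are marked in
+// directional_mode_skip_mask.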
+static void angle_estimation(const uint8_t *src, int src_stride, int rows,
+ int cols, uint8_t *directional_mode_skip_mask) {
int i, r, c, index, dx, dy, temp, sn, remd, quot;
- const int angle_skip_thresh = 10;
uint64_t hist[DIRECTIONAL_MODES];
uint64_t hist_sum = 0;
@@ -1599,19 +2735,17 @@
score += hist[angle_bin + 1];
++weight;
}
- if (score * angle_skip_thresh < hist_sum * weight) {
+ if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
directional_mode_skip_mask[i] = 1;
- }
}
}
}
#if CONFIG_AOM_HIGHBITDEPTH
-static void highbd_angle_estimation(const uint8_t *src8, const int src_stride,
- const int rows, const int cols,
+static void highbd_angle_estimation(const uint8_t *src8, int src_stride,
+ int rows, int cols,
uint8_t *directional_mode_skip_mask) {
int i, r, c, index, dx, dy, temp, sn, remd, quot;
- const int angle_skip_thresh = 10;
uint64_t hist[DIRECTIONAL_MODES];
uint64_t hist_sum = 0;
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
@@ -1653,7 +2787,7 @@
score += hist[angle_bin + 1];
++weight;
}
- if (score * angle_skip_thresh < hist_sum * weight)
+ if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
directional_mode_skip_mask[i] = 1;
}
}
@@ -1666,10 +2800,10 @@
int *rate, int *rate_tokenonly,
int64_t *distortion, int *skippable,
BLOCK_SIZE bsize, int64_t best_rd) {
- PREDICTION_MODE mode;
+ uint8_t mode_idx;
PREDICTION_MODE mode_selected = DC_PRED;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ MODE_INFO *const mic = xd->mi[0];
int this_rate, this_rate_tokenonly, s;
int64_t this_distortion, this_rd;
TX_SIZE best_tx = TX_4X4;
@@ -1678,17 +2812,23 @@
const int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
#endif // CONFIG_EXT_INTRA || CONFIG_PALETTE
#if CONFIG_EXT_INTRA
- int8_t best_angle_delta = 0;
+ const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
+ int is_directional_mode, rate_overhead, best_angle_delta = 0;
+ INTRA_FILTER best_filter = INTRA_FILTER_LINEAR;
uint8_t directional_mode_skip_mask[INTRA_MODES];
const int src_stride = x->plane[0].src.stride;
const uint8_t *src = x->plane[0].src.buf;
- const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ int beat_best_rd = 0;
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+ uint16_t filter_intra_mode_skip_mask = (1 << FILTER_INTRA_MODES) - 1;
+#endif // CONFIG_FILTER_INTRA
TX_TYPE best_tx_type = DCT_DCT;
const int *bmode_costs;
#if CONFIG_PALETTE
PALETTE_MODE_INFO palette_mode_info;
- PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ PALETTE_MODE_INFO *const pmi = &mic->mbmi.palette_mode_info;
uint8_t *best_palette_color_map =
cpi->common.allow_screen_content_tools
? x->palette_buffer->best_palette_color_map
@@ -1697,17 +2837,20 @@
#endif // CONFIG_PALETTE
const MODE_INFO *above_mi = xd->above_mi;
const MODE_INFO *left_mi = xd->left_mi;
- const PREDICTION_MODE A = av1_above_block_mode(xd->mi[0], above_mi, 0);
- const PREDICTION_MODE L = av1_left_block_mode(xd->mi[0], left_mi, 0);
+ const PREDICTION_MODE A = av1_above_block_mode(mic, above_mi, 0);
+ const PREDICTION_MODE L = av1_left_block_mode(mic, left_mi, 0);
+ const PREDICTION_MODE FINAL_MODE_SEARCH = TM_PRED + 1;
+ const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
#if CONFIG_PVQ
od_rollback_buffer pre_buf, post_buf;
od_encode_checkpoint(&x->daala_enc, &pre_buf);
od_encode_checkpoint(&x->daala_enc, &post_buf);
#endif
-
bmode_costs = cpi->y_mode_costs[A][L];
+
#if CONFIG_EXT_INTRA
+ mic->mbmi.angle_delta[0] = 0;
memset(directional_mode_skip_mask, 0,
sizeof(directional_mode_skip_mask[0]) * INTRA_MODES);
#if CONFIG_AOM_HIGHBITDEPTH
@@ -1718,7 +2861,10 @@
#endif
angle_estimation(src, src_stride, rows, cols, directional_mode_skip_mask);
#endif // CONFIG_EXT_INTRA
-
+#if CONFIG_FILTER_INTRA
+ filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mic->mbmi.filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+#endif // CONFIG_FILTER_INTRA
#if CONFIG_PALETTE
palette_mode_info.palette_size[0] = 0;
pmi->palette_size[0] = 0;
@@ -1728,22 +2874,37 @@
palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
#endif // CONFIG_PALETTE
- /* Y Search for intra prediction mode */
- for (mode = DC_PRED; mode <= TM_PRED; mode++) {
- mbmi->mode = mode;
+ if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
+ x->use_default_intra_tx_type = 1;
+ else
+ x->use_default_intra_tx_type = 0;
+ /* Y Search for intra prediction mode */
+ for (mode_idx = DC_PRED; mode_idx <= FINAL_MODE_SEARCH; ++mode_idx) {
+ if (mode_idx == FINAL_MODE_SEARCH) {
+ if (x->use_default_intra_tx_type == 0) break;
+ mic->mbmi.mode = mode_selected;
+ x->use_default_intra_tx_type = 0;
+ } else {
+ mic->mbmi.mode = mode_idx;
+ }
#if CONFIG_PVQ
od_encode_rollback(&x->daala_enc, &pre_buf);
#endif
#if CONFIG_EXT_INTRA
- if (is_directional_mode(mbmi->mode)) {
- if (directional_mode_skip_mask[mbmi->mode]) continue;
+ is_directional_mode =
+ (mic->mbmi.mode != DC_PRED && mic->mbmi.mode != TM_PRED);
+ if (is_directional_mode && directional_mode_skip_mask[mic->mbmi.mode])
+ continue;
+ if (is_directional_mode) {
+ rate_overhead = bmode_costs[mic->mbmi.mode] +
+ write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1, 0);
this_rate_tokenonly = INT_MAX;
- this_rd = rd_pick_intra_angle_sby(
- cpi, x, &this_rate, &this_rate_tokenonly, &this_distortion, &s, bsize,
- bmode_costs[mbmi->mode], best_rd);
+ this_rd = rd_pick_intra_angle_sby(cpi, x, &this_rate,
+ &this_rate_tokenonly, &this_distortion,
+ &s, bsize, rate_overhead, best_rd);
} else {
- mbmi->intra_angle_delta[0] = 0;
+ mic->mbmi.angle_delta[0] = 0;
super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
bsize, best_rd);
}
@@ -1754,32 +2915,58 @@
if (this_rate_tokenonly == INT_MAX) continue;
- this_rate = this_rate_tokenonly + bmode_costs[mode];
+ this_rate = this_rate_tokenonly + bmode_costs[mic->mbmi.mode];
+
+ if (!xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+ // super_block_yrd above includes the cost of the tx_size in the
+ // tokenonly rate, but for intra blocks, tx_size is always coded
+ // (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ this_rate_tokenonly -=
+ cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)]
+ [tx_size_to_depth(mic->mbmi.tx_size)];
+ }
#if CONFIG_PALETTE
- if (cpi->common.allow_screen_content_tools && mode == DC_PRED)
+ if (cpi->common.allow_screen_content_tools && mic->mbmi.mode == DC_PRED)
this_rate += av1_cost_bit(
av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0);
#endif // CONFIG_PALETTE
-
+#if CONFIG_FILTER_INTRA
+ if (mic->mbmi.mode == DC_PRED)
+ this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[0], 0);
+#endif // CONFIG_FILTER_INTRA
#if CONFIG_EXT_INTRA
- if (is_directional_mode(mbmi->mode)) {
- const int max_angle_delta =
- av1_max_angle_delta_y[max_tx_size][mbmi->mode];
+ if (is_directional_mode) {
+ int p_angle;
this_rate +=
- write_uniform_cost(2 * max_angle_delta + 1,
- max_angle_delta + mbmi->intra_angle_delta[0]);
+ write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+ MAX_ANGLE_DELTAS + mic->mbmi.angle_delta[0]);
+ p_angle = mode_to_angle_map[mic->mbmi.mode] +
+ mic->mbmi.angle_delta[0] * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle))
+ this_rate +=
+ cpi->intra_filter_cost[intra_filter_ctx][mic->mbmi.intra_filter];
}
#endif // CONFIG_EXT_INTRA
this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+#if CONFIG_FILTER_INTRA
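+ // The skip mask starts with every filter-intra mode marked as skipped;
+ // toggling the bit for the current mode lets the filter-intra search below
+ // revisit it when its RD cost is within 1/16 of the best so far (or when
+ // no best has been found yet).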
+ if (best_rd == INT64_MAX || this_rd - best_rd < (best_rd >> 4)) {
+ filter_intra_mode_skip_mask ^= (1 << mic->mbmi.mode);
+ }
+#endif // CONFIG_FILTER_INTRA
if (this_rd < best_rd) {
- mode_selected = mode;
+ mode_selected = mic->mbmi.mode;
best_rd = this_rd;
- best_tx = mbmi->tx_size;
- best_tx_type = mbmi->tx_type;
+ best_tx = mic->mbmi.tx_size;
#if CONFIG_EXT_INTRA
- best_angle_delta = mbmi->intra_angle_delta[0];
+ best_angle_delta = mic->mbmi.angle_delta[0];
+ best_filter = mic->mbmi.intra_filter;
#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ beat_best_rd = 1;
+#endif // CONFIG_FILTER_INTRA
+ best_tx_type = mic->mbmi.tx_type;
*rate = this_rate;
*rate_tokenonly = this_rate_tokenonly;
*distortion = this_distortion;
@@ -1802,13 +2989,36 @@
&best_rd);
#endif // CONFIG_PALETTE
- mbmi->mode = mode_selected;
- mbmi->tx_size = best_tx;
- mbmi->tx_type = best_tx_type;
-#if CONFIG_EXT_INTRA
- mbmi->intra_angle_delta[0] = best_angle_delta;
-#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ if (beat_best_rd) {
+ if (rd_pick_filter_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
+ skippable, bsize, bmode_costs[DC_PRED],
+ &best_rd, filter_intra_mode_skip_mask)) {
+ mode_selected = mic->mbmi.mode;
+ best_tx = mic->mbmi.tx_size;
+ filter_intra_mode_info = mic->mbmi.filter_intra_mode_info;
+ best_tx_type = mic->mbmi.tx_type;
+ }
+ }
+ mic->mbmi.filter_intra_mode_info.use_filter_intra_mode[0] =
+ filter_intra_mode_info.use_filter_intra_mode[0];
+ if (filter_intra_mode_info.use_filter_intra_mode[0]) {
+ mic->mbmi.filter_intra_mode_info.filter_intra_mode[0] =
+ filter_intra_mode_info.filter_intra_mode[0];
+#if CONFIG_PALETTE
+ palette_mode_info.palette_size[0] = 0;
+#endif // CONFIG_PALETTE
+ }
+#endif // CONFIG_FILTER_INTRA
+
+ mic->mbmi.mode = mode_selected;
+ mic->mbmi.tx_size = best_tx;
+#if CONFIG_EXT_INTRA
+ mic->mbmi.angle_delta[0] = best_angle_delta;
+ mic->mbmi.intra_filter = best_filter;
+#endif // CONFIG_EXT_INTRA
+ mic->mbmi.tx_type = best_tx_type;
#if CONFIG_PALETTE
pmi->palette_size[0] = palette_mode_info.palette_size[0];
if (palette_mode_info.palette_size[0] > 0) {
@@ -1827,7 +3037,6 @@
static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, int *rate,
int64_t *distortion, int *skippable, int64_t *sse,
BLOCK_SIZE bsize, int64_t ref_best_rd) {
- const AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
@@ -1850,7 +3059,7 @@
if (is_cost_valid) {
for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
- txfm_rd_in_plane(cm, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd,
+ txfm_rd_in_plane(x, cpi, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd,
plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing);
if (pnrate == INT_MAX) {
is_cost_valid = 0;
@@ -1860,6 +3069,11 @@
*distortion += pndist;
*sse += pnsse;
*skippable &= pnskip;
+ if (RDCOST(x->rdmult, x->rddiv, *rate, *distortion) > ref_best_rd &&
+ RDCOST(x->rdmult, x->rddiv, 0, *sse) > ref_best_rd) {
+ is_cost_valid = 0;
+ break;
+ }
}
}
@@ -1874,6 +3088,687 @@
return is_cost_valid;
}
+#if CONFIG_VAR_TX
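+// Compute the RD statistics (rate, distortion, sse, skip) of a single
+// transform block: forward transform and quantization, coefficient cost,
+// and pixel-domain distortion measured against the reconstruction.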
+void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
+ int blk_row, int blk_col, int plane, int block,
+ int plane_bsize, int coeff_ctx, RD_STATS *rd_stats) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ int64_t tmp;
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+ TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
+ BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size];
+ int bh = block_size_high[txm_bsize];
+ int bw = block_size_wide[txm_bsize];
+ int txb_h = tx_size_high_unit[tx_size];
+ int txb_w = tx_size_wide_unit[tx_size];
+
+ int src_stride = p->src.stride;
+ uint8_t *src = &p->src.buf[4 * blk_row * src_stride + 4 * blk_col];
+ uint8_t *dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
+#if CONFIG_AOM_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, rec_buffer16[MAX_TX_SQUARE]);
+ uint8_t *rec_buffer;
+#else
+ DECLARE_ALIGNED(16, uint8_t, rec_buffer[MAX_TX_SQUARE]);
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ int max_blocks_high = block_size_high[plane_bsize];
+ int max_blocks_wide = block_size_wide[plane_bsize];
+ const int diff_stride = max_blocks_wide;
+ const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+ int txb_coeff_cost;
+#if CONFIG_EXT_TX
+ assert(tx_size < TX_SIZES);
+#endif // CONFIG_EXT_TX
+
+ if (xd->mb_to_bottom_edge < 0)
+ max_blocks_high += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
+ if (xd->mb_to_right_edge < 0)
+ max_blocks_wide += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+
+ max_blocks_high >>= tx_size_wide_log2[0];
+ max_blocks_wide >>= tx_size_wide_log2[0];
+
+#if CONFIG_NEW_QUANT
+ av1_xform_quant_fp_nuq(cm, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, coeff_ctx);
+#else
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+ AV1_XFORM_QUANT_FP);
+#endif // CONFIG_NEW_QUANT
+
+ av1_optimize_b(cm, x, plane, block, tx_size, coeff_ctx);
+
+// TODO(any): Use dist_block to compute distortion
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ rec_buffer = CONVERT_TO_BYTEPTR(rec_buffer16);
+ aom_highbd_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL,
+ 0, NULL, 0, bw, bh, xd->bd);
+ } else {
+ rec_buffer = (uint8_t *)rec_buffer16;
+ aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0,
+ NULL, 0, bw, bh);
+ }
+#else
+ aom_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE, NULL, 0, NULL,
+ 0, bw, bh);
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
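+ // If the transform block extends past the visible frame edge, accumulate
+ // the residual SSE only over the visible 8x8 sub-blocks.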
+ if (blk_row + txb_h > max_blocks_high || blk_col + txb_w > max_blocks_wide) {
+ int idx, idy;
+ int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row);
+ int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col);
+ tmp = 0;
+ for (idy = 0; idy < blocks_height; idy += 2) {
+ for (idx = 0; idx < blocks_width; idx += 2) {
+ const int16_t *d = diff + 4 * idy * diff_stride + 4 * idx;
+ tmp += aom_sum_squares_2d_i16(d, diff_stride, 8);
+ }
+ }
+ } else {
+ tmp = sum_squares_2d(diff, diff_stride, tx_size);
+ }
+
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ rd_stats->sse += tmp * 16;
+
+ if (p->eobs[block] > 0) {
+ INV_TXFM_PARAM inv_txfm_param;
+ inv_txfm_param.tx_type = tx_type;
+ inv_txfm_param.tx_size = tx_size;
+ inv_txfm_param.eob = p->eobs[block];
+ inv_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ inv_txfm_param.bd = xd->bd;
+ highbd_inv_txfm_add(dqcoeff, rec_buffer, MAX_TX_SIZE, &inv_txfm_param);
+ } else {
+ inv_txfm_add(dqcoeff, rec_buffer, MAX_TX_SIZE, &inv_txfm_param);
+ }
+#else // CONFIG_AOM_HIGHBITDEPTH
+ inv_txfm_add(dqcoeff, rec_buffer, MAX_TX_SIZE, &inv_txfm_param);
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+ if (txb_w + blk_col > max_blocks_wide ||
+ txb_h + blk_row > max_blocks_high) {
+ int idx, idy;
+ unsigned int this_dist;
+ int blocks_height = AOMMIN(txb_h, max_blocks_high - blk_row);
+ int blocks_width = AOMMIN(txb_w, max_blocks_wide - blk_col);
+ tmp = 0;
+ for (idy = 0; idy < blocks_height; idy += 2) {
+ for (idx = 0; idx < blocks_width; idx += 2) {
+ uint8_t *const s = src + 4 * idy * src_stride + 4 * idx;
+ uint8_t *const r = rec_buffer + 4 * idy * MAX_TX_SIZE + 4 * idx;
+ cpi->fn_ptr[BLOCK_8X8].vf(s, src_stride, r, MAX_TX_SIZE, &this_dist);
+ tmp += this_dist;
+ }
+ }
+ } else {
+ uint32_t this_dist;
+ cpi->fn_ptr[txm_bsize].vf(src, src_stride, rec_buffer, MAX_TX_SIZE,
+ &this_dist);
+ tmp = this_dist;
+ }
+ }
+ rd_stats->dist += tmp * 16;
+ txb_coeff_cost = av1_cost_coeffs(cm, x, plane, block, coeff_ctx, tx_size,
+ scan_order->scan, scan_order->neighbors, 0);
+ rd_stats->rate += txb_coeff_cost;
+ rd_stats->skip &= (p->eobs[block] == 0);
+#if CONFIG_RD_DEBUG
+ {
+ int idx, idy;
+ rd_stats->txb_coeff_cost[plane] += txb_coeff_cost;
+
+ for (idy = 0; idy < txb_h; ++idy)
+ for (idx = 0; idx < txb_w; ++idx)
+ rd_stats->txb_coeff_cost_map[plane][blk_row + idy][blk_col + idx] = 0;
+
+ rd_stats->txb_coeff_cost_map[plane][blk_row][blk_col] = txb_coeff_cost;
+
+ assert(blk_row < 16);
+ assert(blk_col < 16);
+ }
+#endif
+}
+
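+// Recursively decide, for one transform block, whether to code it at the
+// current tx_size or to split it into four sub-blocks (up to
+// MAX_VARTX_DEPTH), keeping whichever choice has the lower RD cost.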
+static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+ int blk_col, int plane, int block, TX_SIZE tx_size,
+ int depth, BLOCK_SIZE plane_bsize,
+ ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl,
+ TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+ RD_STATS *rd_stats, int64_t ref_best_rd,
+ int *is_cost_valid) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int tx_row = blk_row >> (1 - pd->subsampling_y);
+ const int tx_col = blk_col >> (1 - pd->subsampling_x);
+ TX_SIZE(*const inter_tx_size)
+ [MAX_MIB_SIZE] =
+ (TX_SIZE(*)[MAX_MIB_SIZE]) & mbmi->inter_tx_size[tx_row][tx_col];
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ int64_t this_rd = INT64_MAX;
+ ENTROPY_CONTEXT *pta = ta + blk_col;
+ ENTROPY_CONTEXT *ptl = tl + blk_row;
+ int coeff_ctx, i;
+ int ctx =
+ txfm_partition_context(tx_above + (blk_col >> 1),
+ tx_left + (blk_row >> 1), mbmi->sb_type, tx_size);
+
+ int64_t sum_rd = INT64_MAX;
+ int tmp_eob = 0;
+ int zero_blk_rate;
+ RD_STATS sum_rd_stats;
+ av1_init_rd_stats(&sum_rd_stats);
+
+#if CONFIG_EXT_TX
+ assert(tx_size < TX_SIZES);
+#endif // CONFIG_EXT_TX
+
+ if (ref_best_rd < 0) {
+ *is_cost_valid = 0;
+ return;
+ }
+
+ coeff_ctx = get_entropy_context(tx_size, pta, ptl);
+
+ av1_init_rd_stats(rd_stats);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ zero_blk_rate =
+ x->token_costs[tx_size][pd->plane_type][1][0][0][coeff_ctx][EOB_TOKEN];
+
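+ // zero_blk_rate is the cost of coding this block with an immediate EOB;
+ // in non-lossless mode, if signalling an all-zero block is cheaper in RD
+ // terms (or the block already skips), force the coefficients to zero and
+ // mark the block as skipped.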
+ if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) {
+ inter_tx_size[0][0] = tx_size;
+ av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
+ plane_bsize, coeff_ctx, rd_stats);
+
+ if ((RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist) >=
+ RDCOST(x->rdmult, x->rddiv, zero_blk_rate, rd_stats->sse) ||
+ rd_stats->skip == 1) &&
+ !xd->lossless[mbmi->segment_id]) {
+ rd_stats->rate = zero_blk_rate;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip = 1;
+ x->blk_skip[plane][blk_row * bw + blk_col] = 1;
+ p->eobs[block] = 0;
+ } else {
+ x->blk_skip[plane][blk_row * bw + blk_col] = 0;
+ rd_stats->skip = 0;
+ }
+
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+ rd_stats->rate +=
+ av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
+ this_rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist);
+ tmp_eob = p->eobs[block];
+ }
+
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs];
+ int sub_step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
+ RD_STATS this_rd_stats;
+ int this_cost_valid = 1;
+ int64_t tmp_rd = 0;
+
+ sum_rd_stats.rate =
+ av1_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1);
+#if CONFIG_EXT_TX
+ assert(tx_size < TX_SIZES);
+#endif // CONFIG_EXT_TX
+ for (i = 0; i < 4 && this_cost_valid; ++i) {
+ int offsetr = blk_row + (i >> 1) * bsl;
+ int offsetc = blk_col + (i & 0x01) * bsl;
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ select_tx_block(cpi, x, offsetr, offsetc, plane, block, sub_txs,
+ depth + 1, plane_bsize, ta, tl, tx_above, tx_left,
+ &this_rd_stats, ref_best_rd - tmp_rd, &this_cost_valid);
+
+ av1_merge_rd_stats(&sum_rd_stats, &this_rd_stats);
+
+ tmp_rd =
+ RDCOST(x->rdmult, x->rddiv, sum_rd_stats.rate, sum_rd_stats.dist);
+ if (this_rd < tmp_rd) break;
+ block += sub_step;
+ }
+ if (this_cost_valid) sum_rd = tmp_rd;
+ }
+
+ if (this_rd < sum_rd) {
+ int idx, idy;
+ for (i = 0; i < tx_size_wide_unit[tx_size]; ++i) pta[i] = !(tmp_eob == 0);
+ for (i = 0; i < tx_size_high_unit[tx_size]; ++i) ptl[i] = !(tmp_eob == 0);
+ txfm_partition_update(tx_above + (blk_col >> 1), tx_left + (blk_row >> 1),
+ tx_size);
+ inter_tx_size[0][0] = tx_size;
+ for (idy = 0; idy < tx_size_high_unit[tx_size] / 2; ++idy)
+ for (idx = 0; idx < tx_size_wide_unit[tx_size] / 2; ++idx)
+ inter_tx_size[idy][idx] = tx_size;
+ mbmi->tx_size = tx_size;
+ if (this_rd == INT64_MAX) *is_cost_valid = 0;
+ x->blk_skip[plane][blk_row * bw + blk_col] = rd_stats->skip;
+ } else {
+ *rd_stats = sum_rd_stats;
+ if (sum_rd == INT64_MAX) *is_cost_valid = 0;
+ }
+}
+
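+// Variable tx-size luma RD for an inter block: walk the plane in
+// max-transform-size steps and run select_tx_block() on each position,
+// accumulating the RD stats.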
+static void inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int is_cost_valid = 1;
+ int64_t this_rd = 0;
+
+ if (ref_best_rd < 0) is_cost_valid = 0;
+
+ av1_init_rd_stats(rd_stats);
+
+ if (is_cost_valid) {
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+ const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ int idx, idy;
+ int block = 0;
+ int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+
+ RD_STATS pn_rd_stats;
+ av1_init_rd_stats(&pn_rd_stats);
+
+ av1_get_entropy_contexts(bsize, TX_4X4, pd, ctxa, ctxl);
+ memcpy(tx_above, xd->above_txfm_context,
+ sizeof(TXFM_CONTEXT) * (mi_width >> 1));
+ memcpy(tx_left, xd->left_txfm_context,
+ sizeof(TXFM_CONTEXT) * (mi_height >> 1));
+
+ for (idy = 0; idy < mi_height; idy += bh) {
+ for (idx = 0; idx < mi_width; idx += bw) {
+ select_tx_block(cpi, x, idy, idx, 0, block, max_tx_size,
+ mi_height != mi_width, plane_bsize, ctxa, ctxl,
+ tx_above, tx_left, &pn_rd_stats, ref_best_rd - this_rd,
+ &is_cost_valid);
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ this_rd += AOMMIN(
+ RDCOST(x->rdmult, x->rddiv, pn_rd_stats.rate, pn_rd_stats.dist),
+ RDCOST(x->rdmult, x->rddiv, 0, pn_rd_stats.sse));
+ block += step;
+ }
+ }
+ }
+
+ this_rd = AOMMIN(RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist),
+ RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse));
+ if (this_rd > ref_best_rd) is_cost_valid = 0;
+
+ if (!is_cost_valid) {
+ // reset cost value
+ av1_invalid_rd_stats(rd_stats);
+ }
+}
+
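+// For a fixed transform type, select the per-block transform sizes for the
+// Y plane (and, with EXT_TX + RECT_TX, compare against a single rectangular
+// transform), then add the skip and transform-type signalling costs.
+// Returns the resulting RD cost.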
+static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd, TX_TYPE tx_type) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int is_inter = is_inter_block(mbmi);
+ aom_prob skip_prob = av1_get_skip_prob(cm, xd);
+ int s0 = av1_cost_bit(skip_prob, 0);
+ int s1 = av1_cost_bit(skip_prob, 1);
+ int64_t rd;
+ int row, col;
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+
+ mbmi->tx_type = tx_type;
+ mbmi->min_tx_size = TX_SIZES_ALL;
+ inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd);
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (is_rect_tx_allowed(xd, mbmi)) {
+ RD_STATS rect_rd_stats;
+ int64_t rd_rect_tx;
+ int tx_size_cat = inter_tx_size_cat_lookup[bsize];
+ TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+ TX_SIZE var_tx_size = mbmi->tx_size;
+
+ txfm_rd_in_plane(x, cpi, &rect_rd_stats.rate, &rect_rd_stats.dist,
+ &rect_rd_stats.skip, &rect_rd_stats.sse, ref_best_rd, 0,
+ bsize, tx_size, cpi->sf.use_fast_coef_costing);
+
+ if (rd_stats->rate != INT_MAX) {
+ rd_stats->rate += av1_cost_bit(cm->fc->rect_tx_prob[tx_size_cat], 0);
+ if (rd_stats->skip) {
+ rd = RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse);
+ } else {
+ rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate + s0, rd_stats->dist);
+ if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+ !rd_stats->skip)
+ rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse));
+ }
+ } else {
+ rd = INT64_MAX;
+ }
+
+ if (rect_rd_stats.rate != INT_MAX) {
+ rect_rd_stats.rate += av1_cost_bit(cm->fc->rect_tx_prob[tx_size_cat], 1);
+ if (rect_rd_stats.skip) {
+ rd_rect_tx = RDCOST(x->rdmult, x->rddiv, s1, rect_rd_stats.sse);
+ } else {
+ rd_rect_tx = RDCOST(x->rdmult, x->rddiv, rect_rd_stats.rate + s0,
+ rect_rd_stats.dist);
+ if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+ !(rect_rd_stats.skip))
+ rd_rect_tx = AOMMIN(
+ rd_rect_tx, RDCOST(x->rdmult, x->rddiv, s1, rect_rd_stats.sse));
+ }
+ } else {
+ rd_rect_tx = INT64_MAX;
+ }
+
+ if (rd_rect_tx < rd) {
+ *rd_stats = rect_rd_stats;
+ if (!xd->lossless[mbmi->segment_id]) x->blk_skip[0][0] = rd_stats->skip;
+ mbmi->tx_size = tx_size;
+ mbmi->inter_tx_size[0][0] = mbmi->tx_size;
+ } else {
+ mbmi->tx_size = var_tx_size;
+ }
+ }
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+ for (row = 0; row < max_blocks_high / 2; ++row)
+ for (col = 0; col < max_blocks_wide / 2; ++col)
+ mbmi->min_tx_size = AOMMIN(
+ mbmi->min_tx_size, get_min_tx_size(mbmi->inter_tx_size[row][col]));
+
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(mbmi->min_tx_size, bsize, is_inter) > 1 &&
+ !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+ int ext_tx_set = get_ext_tx_set(mbmi->min_tx_size, bsize, is_inter);
+ if (is_inter) {
+ if (ext_tx_set > 0)
+ rd_stats->rate +=
+ cpi->inter_tx_type_costs[ext_tx_set]
+ [txsize_sqr_map[mbmi->min_tx_size]]
+ [mbmi->tx_type];
+ } else {
+ if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
+ rd_stats->rate +=
+ cpi->intra_tx_type_costs[ext_tx_set][mbmi->min_tx_size][mbmi->mode]
+ [mbmi->tx_type];
+ }
+ }
+#else // CONFIG_EXT_TX
+ if (mbmi->min_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id])
+ rd_stats->rate +=
+ cpi->inter_tx_type_costs[mbmi->min_tx_size][mbmi->tx_type];
+#endif // CONFIG_EXT_TX
+
+ if (rd_stats->skip)
+ rd = RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse);
+ else
+ rd = RDCOST(x->rdmult, x->rddiv, rd_stats->rate + s0, rd_stats->dist);
+
+ if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+ !(rd_stats->skip))
+ rd = AOMMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, rd_stats->sse));
+
+ return rd;
+}
+
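+// Search over the allowed transform types for the Y plane and keep the best
+// type together with its per-block transform sizes and skip map.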
+static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd) {
+ const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int64_t rd = INT64_MAX;
+ int64_t best_rd = INT64_MAX;
+ TX_TYPE tx_type, best_tx_type = DCT_DCT;
+ const int is_inter = is_inter_block(mbmi);
+ TX_SIZE best_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
+ TX_SIZE best_tx = max_txsize_lookup[bsize];
+ TX_SIZE best_min_tx_size = TX_SIZES_ALL;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
+ const int n4 = 1 << (num_pels_log2_lookup[bsize] - 4);
+ int idx, idy;
+ int prune = 0;
+#if CONFIG_EXT_TX
+ int ext_tx_set = get_ext_tx_set(max_tx_size, bsize, is_inter);
+#endif // CONFIG_EXT_TX
+
+ if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
+#if CONFIG_EXT_TX
+ prune = prune_tx_types(cpi, bsize, x, xd, ext_tx_set);
+#else
+ prune = prune_tx_types(cpi, bsize, x, xd, 0);
+#endif
+
+ av1_invalid_rd_stats(rd_stats);
+
+ for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+ RD_STATS this_rd_stats;
+ av1_init_rd_stats(&this_rd_stats);
+#if CONFIG_EXT_TX
+ if (is_inter) {
+ if (!ext_tx_used_inter[ext_tx_set][tx_type]) continue;
+ if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
+ if (!do_tx_type_search(tx_type, prune)) continue;
+ }
+ } else {
+ if (!ALLOW_INTRA_EXT_TX && bsize >= BLOCK_8X8) {
+ if (tx_type != intra_mode_to_tx_type_context[mbmi->mode]) continue;
+ }
+ if (!ext_tx_used_intra[ext_tx_set][tx_type]) continue;
+ }
+#else // CONFIG_EXT_TX
+ if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
+ !do_tx_type_search(tx_type, prune))
+ continue;
+#endif // CONFIG_EXT_TX
+ if (is_inter && x->use_default_inter_tx_type &&
+ tx_type != get_default_tx_type(0, xd, 0, max_tx_size))
+ continue;
+
+ if (xd->lossless[mbmi->segment_id])
+ if (tx_type != DCT_DCT) continue;
+
+ rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
+ tx_type);
+
+ if (rd < best_rd) {
+ best_rd = rd;
+ *rd_stats = this_rd_stats;
+ best_tx_type = mbmi->tx_type;
+ best_tx = mbmi->tx_size;
+ best_min_tx_size = mbmi->min_tx_size;
+ memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4);
+ for (idy = 0; idy < xd->n8_h; ++idy)
+ for (idx = 0; idx < xd->n8_w; ++idx)
+ best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
+ }
+ }
+
+ mbmi->tx_type = best_tx_type;
+ for (idy = 0; idy < xd->n8_h; ++idy)
+ for (idx = 0; idx < xd->n8_w; ++idx)
+ mbmi->inter_tx_size[idy][idx] = best_tx_size[idy][idx];
+ mbmi->tx_size = best_tx;
+ mbmi->min_tx_size = best_min_tx_size;
+#if CONFIG_RD_DEBUG
+  // Record the Y plane's transform block coefficient cost.
+ mbmi->rd_stats = *rd_stats;
+#endif
+ memcpy(x->blk_skip[0], best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+}
+
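+// Accumulate RD stats for one transform block, recursing into the four
+// sub-blocks when the signalled transform size is smaller than tx_size.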
+static void tx_block_rd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+ int blk_col, int plane, int block, TX_SIZE tx_size,
+ BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx,
+ ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ const int tx_row = blk_row >> (1 - pd->subsampling_y);
+ const int tx_col = blk_col >> (1 - pd->subsampling_x);
+ TX_SIZE plane_tx_size;
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+#if CONFIG_EXT_TX
+ assert(tx_size < TX_SIZES);
+#endif // CONFIG_EXT_TX
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ plane_tx_size =
+ plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
+ : mbmi->inter_tx_size[tx_row][tx_col];
+
+ if (tx_size == plane_tx_size) {
+ int coeff_ctx, i;
+ ENTROPY_CONTEXT *ta = above_ctx + blk_col;
+ ENTROPY_CONTEXT *tl = left_ctx + blk_row;
+ coeff_ctx = get_entropy_context(tx_size, ta, tl);
+ av1_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
+ plane_bsize, coeff_ctx, rd_stats);
+
+ for (i = 0; i < tx_size_wide_unit[tx_size]; ++i)
+ ta[i] = !(p->eobs[block] == 0);
+ for (i = 0; i < tx_size_high_unit[tx_size]; ++i)
+ tl[i] = !(p->eobs[block] == 0);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs];
+ int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
+ int i;
+
+ assert(bsl > 0);
+
+ for (i = 0; i < 4; ++i) {
+ int offsetr = blk_row + (i >> 1) * bsl;
+ int offsetc = blk_col + (i & 0x01) * bsl;
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ tx_block_rd(cpi, x, offsetr, offsetc, plane, block, sub_txs, plane_bsize,
+ above_ctx, left_ctx, rd_stats);
+ block += step;
+ }
+ }
+}
+
+// Return value 0: early termination triggered, no valid rd cost available;
+// 1: rd cost values are valid.
+static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ int plane;
+ int is_cost_valid = 1;
+ int64_t this_rd;
+
+ if (ref_best_rd < 0) is_cost_valid = 0;
+
+ av1_init_rd_stats(rd_stats);
+
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ if (is_rect_tx(mbmi->tx_size)) {
+ return super_block_uvrd(cpi, x, &rd_stats->rate, &rd_stats->dist,
+ &rd_stats->skip, &rd_stats->sse, bsize,
+ ref_best_rd);
+ }
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+
+ if (is_inter_block(mbmi) && is_cost_valid) {
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane)
+ av1_subtract_plane(x, bsize, plane);
+ }
+
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+ const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ int idx, idy;
+ int block = 0;
+ const int step = bh * bw;
+ ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE];
+ ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE];
+ RD_STATS pn_rd_stats;
+ av1_init_rd_stats(&pn_rd_stats);
+
+ av1_get_entropy_contexts(bsize, TX_4X4, pd, ta, tl);
+
+ for (idy = 0; idy < mi_height; idy += bh) {
+ for (idx = 0; idx < mi_width; idx += bw) {
+ tx_block_rd(cpi, x, idy, idx, plane, block, max_tx_size, plane_bsize,
+ ta, tl, &pn_rd_stats);
+ block += step;
+ }
+ }
+
+ if (pn_rd_stats.rate == INT_MAX) {
+ is_cost_valid = 0;
+ break;
+ }
+
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+
+ this_rd =
+ AOMMIN(RDCOST(x->rdmult, x->rddiv, rd_stats->rate, rd_stats->dist),
+ RDCOST(x->rdmult, x->rddiv, 0, rd_stats->sse));
+
+ if (this_rd > ref_best_rd) {
+ is_cost_valid = 0;
+ break;
+ }
+ }
+
+ if (!is_cost_valid) {
+ // reset cost value
+ av1_invalid_rd_stats(rd_stats);
+ }
+
+ return is_cost_valid;
+}
+#endif // CONFIG_VAR_TX
+
#if CONFIG_PALETTE
static void rd_pick_palette_intra_sbuv(
const AV1_COMP *const cpi, MACROBLOCK *x, int dc_mode_cost,
@@ -1896,6 +3791,10 @@
if (rows * cols > PALETTE_MAX_BLOCK_SIZE) return;
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
+
#if CONFIG_AOM_HIGHBITDEPTH
if (cpi->common.use_highbitdepth) {
colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols,
@@ -1942,6 +3841,9 @@
#endif // CONFIG_AOM_HIGHBITDEPTH
mbmi->uv_mode = DC_PRED;
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
for (r = 0; r < rows; ++r) {
for (c = 0; c < cols; ++c) {
#if CONFIG_AOM_HIGHBITDEPTH
@@ -2029,85 +3931,154 @@
}
#endif // CONFIG_PALETTE
+#if CONFIG_FILTER_INTRA
+// Return 1 if a filter intra mode is selected; return 0 otherwise.
+static int rd_pick_filter_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, int *skippable,
+ BLOCK_SIZE bsize, int64_t *best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ int filter_intra_selected_flag = 0;
+ int this_rate_tokenonly, this_rate, s;
+ int64_t this_distortion, this_sse, this_rd;
+ FILTER_INTRA_MODE mode;
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+
+ av1_zero(filter_intra_mode_info);
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 1;
+ mbmi->uv_mode = DC_PRED;
+#if CONFIG_PALETTE
+ mbmi->palette_mode_info.palette_size[1] = 0;
+#endif // CONFIG_PALETTE
+
+ for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+ mbmi->filter_intra_mode_info.filter_intra_mode[1] = mode;
+ if (!super_block_uvrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
+ &this_sse, bsize, *best_rd))
+ continue;
+
+ this_rate = this_rate_tokenonly +
+ av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 1) +
+ cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] +
+ write_uniform_cost(FILTER_INTRA_MODES, mode);
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = this_distortion;
+ *skippable = s;
+ filter_intra_mode_info = mbmi->filter_intra_mode_info;
+ filter_intra_selected_flag = 1;
+ }
+ }
+
+ if (filter_intra_selected_flag) {
+ mbmi->uv_mode = DC_PRED;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
+ filter_intra_mode_info.use_filter_intra_mode[1];
+ mbmi->filter_intra_mode_info.filter_intra_mode[1] =
+ filter_intra_mode_info.filter_intra_mode[1];
+ return 1;
+ } else {
+ return 0;
+ }
+}
+#endif // CONFIG_FILTER_INTRA
+
#if CONFIG_EXT_INTRA
-static int64_t pick_intra_angle_routine_sbuv(
+static void pick_intra_angle_routine_sbuv(
const AV1_COMP *const cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly,
- int64_t *distortion, int *skippable, int8_t *best_angle_delta,
- BLOCK_SIZE bsize, int rate_overhead, int64_t *best_rd, int64_t best_rd_in) {
+ int64_t *distortion, int *skippable, int *best_angle_delta,
+ BLOCK_SIZE bsize, int rate_overhead, int64_t *best_rd) {
MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
int this_rate_tokenonly, this_rate, s;
int64_t this_distortion, this_sse, this_rd;
if (!super_block_uvrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
- &this_sse, bsize, best_rd_in))
- return INT64_MAX;
+ &this_sse, bsize, *best_rd))
+ return;
this_rate = this_rate_tokenonly + rate_overhead;
this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
if (this_rd < *best_rd) {
*best_rd = this_rd;
- *best_angle_delta = mbmi->intra_angle_delta[1];
+ *best_angle_delta = mbmi->angle_delta[1];
*rate = this_rate;
*rate_tokenonly = this_rate_tokenonly;
*distortion = this_distortion;
*skippable = s;
}
- return this_rd;
}
-static int rd_pick_intra_angle_sbuv(const AV1_COMP *cpi, MACROBLOCK *x,
+static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
int64_t *distortion, int *skippable,
BLOCK_SIZE bsize, int rate_overhead,
int64_t best_rd) {
- MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
- int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
- int8_t angle_delta, best_angle_delta = 0;
- int i;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ int this_rate_tokenonly, this_rate, s;
+ int64_t this_distortion, this_sse, this_rd;
+ int angle_delta, best_angle_delta = 0;
+ const double rd_adjust = 1.2;
*rate_tokenonly = INT_MAX;
- *skippable = 0;
- *distortion = INT64_MAX;
- for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
+ if (ANGLE_FAST_SEARCH) {
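+    // Two-stage fast search: first evaluate the angle deltas {0, -2, 2},
+    // then refine around the best of those with its neighbouring odd deltas.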
+ int deltas_level1[3] = { 0, -2, 2 };
+ int deltas_level2[3][2] = {
+ { -1, 1 }, { -3, -1 }, { 1, 3 },
+ };
+ const int level1 = 3, level2 = 2;
+ int i, j, best_i = -1;
- for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA_UV; angle_delta += 2) {
- for (i = 0; i < 2; ++i) {
- best_rd_in = (best_rd == INT64_MAX)
- ? INT64_MAX
- : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5)));
- mbmi->intra_angle_delta[1] = (1 - 2 * i) * angle_delta;
- this_rd = pick_intra_angle_routine_sbuv(
- cpi, x, rate, rate_tokenonly, distortion, skippable,
- &best_angle_delta, bsize, rate_overhead, &best_rd, best_rd_in);
- rd_cost[2 * angle_delta + i] = this_rd;
- if (angle_delta == 0) {
- if (this_rd == INT64_MAX) return 0;
- rd_cost[1] = this_rd;
- break;
+ for (i = 0; i < level1; ++i) {
+ int64_t tmp_best_rd;
+ mbmi->angle_delta[1] = deltas_level1[i];
+ tmp_best_rd = (i == 0 && best_rd < INT64_MAX)
+ ? (int64_t)(best_rd * rd_adjust)
+ : best_rd;
+ if (!super_block_uvrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
+ &this_sse, bsize, tmp_best_rd)) {
+ if (i == 0)
+ break;
+ else
+ continue;
}
+ this_rate = this_rate_tokenonly + rate_overhead;
+ this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+ if (i == 0 && best_rd < INT64_MAX && this_rd > best_rd * rd_adjust) break;
+ if (this_rd < best_rd) {
+ best_i = i;
+ best_rd = this_rd;
+ best_angle_delta = mbmi->angle_delta[1];
+ *rate = this_rate;
+ *rate_tokenonly = this_rate_tokenonly;
+ *distortion = this_distortion;
+ *skippable = s;
+ }
+ }
+
+ if (best_i >= 0) {
+ for (j = 0; j < level2; ++j) {
+ mbmi->angle_delta[1] = deltas_level2[best_i][j];
+ pick_intra_angle_routine_sbuv(cpi, x, rate, rate_tokenonly, distortion,
+ skippable, &best_angle_delta, bsize,
+ rate_overhead, &best_rd);
+ }
+ }
+ } else {
+ for (angle_delta = -MAX_ANGLE_DELTAS; angle_delta <= MAX_ANGLE_DELTAS;
+ ++angle_delta) {
+ mbmi->angle_delta[1] = angle_delta;
+ pick_intra_angle_routine_sbuv(cpi, x, rate, rate_tokenonly, distortion,
+ skippable, &best_angle_delta, bsize,
+ rate_overhead, &best_rd);
}
}
- assert(best_rd != INT64_MAX);
- for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA_UV; angle_delta += 2) {
- int skip_search;
- int64_t rd_thresh;
- for (i = 0; i < 2; ++i) {
- skip_search = 0;
- rd_thresh = best_rd + (best_rd >> 5);
- if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
- rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
- skip_search = 1;
- if (!skip_search) {
- mbmi->intra_angle_delta[1] = (1 - 2 * i) * angle_delta;
- this_rd = pick_intra_angle_routine_sbuv(
- cpi, x, rate, rate_tokenonly, distortion, skippable,
- &best_angle_delta, bsize, rate_overhead, &best_rd, best_rd);
- }
- }
- }
-
- mbmi->intra_angle_delta[1] = best_angle_delta;
+ mbmi->angle_delta[1] = best_angle_delta;
return *rate_tokenonly != INT_MAX;
}
#endif // CONFIG_EXT_INTRA
@@ -2116,13 +4087,10 @@
int *rate, int *rate_tokenonly,
int64_t *distortion, int *skippable,
BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
- MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
PREDICTION_MODE mode;
PREDICTION_MODE mode_selected = DC_PRED;
-#if CONFIG_EXT_INTRA
- int8_t best_angle_delta = 0;
- int rate_overhead;
-#endif // CONFIG_EXT_INTRA
int64_t best_rd = INT64_MAX, this_rd;
int this_rate_tokenonly, this_rate, s;
int64_t this_distortion, this_sse;
@@ -2132,7 +4100,6 @@
od_encode_checkpoint(&x->daala_enc, &buf);
#endif
#if CONFIG_PALETTE
- MACROBLOCKD *const xd = &x->e_mbd;
const int rows =
(4 * num_4x4_blocks_high_lookup[bsize]) >> (xd->plane[1].subsampling_y);
const int cols =
@@ -2140,30 +4107,35 @@
PALETTE_MODE_INFO palette_mode_info;
PALETTE_MODE_INFO *const pmi = &xd->mi[0]->mbmi.palette_mode_info;
uint8_t *best_palette_color_map = NULL;
+#endif // CONFIG_PALETTE
+#if CONFIG_EXT_INTRA
+ int is_directional_mode, rate_overhead, best_angle_delta = 0;
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+ filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
+#if CONFIG_PALETTE
palette_mode_info.palette_size[1] = 0;
pmi->palette_size[1] = 0;
#endif // CONFIG_PALETTE
-
for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode))) continue;
mbmi->uv_mode = mode;
-
#if CONFIG_EXT_INTRA
+ is_directional_mode = (mode != DC_PRED && mode != TM_PRED);
rate_overhead = cpi->intra_uv_mode_cost[mbmi->mode][mode] +
- write_uniform_cost(2 * MAX_ANGLE_DELTA_UV + 1, 0);
- if (mbmi->sb_type >= BLOCK_8X8 && is_directional_mode(mbmi->uv_mode)) {
+ write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1, 0);
+ mbmi->angle_delta[1] = 0;
+ if (mbmi->sb_type >= BLOCK_8X8 && is_directional_mode) {
if (!rd_pick_intra_angle_sbuv(cpi, x, &this_rate, &this_rate_tokenonly,
&this_distortion, &s, bsize, rate_overhead,
best_rd))
continue;
- rate_overhead =
- cpi->intra_uv_mode_cost[mbmi->mode][mode] +
- write_uniform_cost(2 * MAX_ANGLE_DELTA_UV + 1,
- MAX_ANGLE_DELTA_UV + mbmi->intra_angle_delta[1]);
} else {
- mbmi->intra_angle_delta[1] = 0;
if (!super_block_uvrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
&this_sse, bsize, best_rd)) {
#if CONFIG_PVQ
@@ -2171,9 +4143,11 @@
#endif
continue;
}
- rate_overhead = cpi->intra_uv_mode_cost[mbmi->mode][mode];
}
- this_rate = this_rate_tokenonly + rate_overhead;
+ this_rate = this_rate_tokenonly + cpi->intra_uv_mode_cost[mbmi->mode][mode];
+ if (mbmi->sb_type >= BLOCK_8X8 && is_directional_mode)
+ this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+ MAX_ANGLE_DELTAS + mbmi->angle_delta[1]);
#else
if (!super_block_uvrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s,
&this_sse, bsize, best_rd)) {
@@ -2184,7 +4158,10 @@
}
this_rate = this_rate_tokenonly + cpi->intra_uv_mode_cost[mbmi->mode][mode];
#endif // CONFIG_EXT_INTRA
-
+#if CONFIG_FILTER_INTRA
+ if (mbmi->sb_type >= BLOCK_8X8 && mode == DC_PRED)
+ this_rate += av1_cost_bit(cpi->common.fc->filter_intra_probs[1], 0);
+#endif // CONFIG_FILTER_INTRA
#if CONFIG_PALETTE
if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8 &&
mode == DC_PRED)
@@ -2211,7 +4188,7 @@
if (this_rd < best_rd) {
mode_selected = mode;
#if CONFIG_EXT_INTRA
- best_angle_delta = mbmi->intra_angle_delta[1];
+ best_angle_delta = mbmi->angle_delta[1];
#endif // CONFIG_EXT_INTRA
best_rd = this_rd;
*rate = this_rate;
@@ -2229,6 +4206,33 @@
&palette_mode_info, best_palette_color_map, &mode_selected, &best_rd,
rate, rate_tokenonly, distortion, skippable);
}
+#endif // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+ if (mbmi->sb_type >= BLOCK_8X8) {
+ if (rd_pick_filter_intra_sbuv(cpi, x, rate, rate_tokenonly, distortion,
+ skippable, bsize, &best_rd)) {
+ mode_selected = mbmi->uv_mode;
+ filter_intra_mode_info = mbmi->filter_intra_mode_info;
+ }
+ }
+
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
+ filter_intra_mode_info.use_filter_intra_mode[1];
+ if (filter_intra_mode_info.use_filter_intra_mode[1]) {
+ mbmi->filter_intra_mode_info.filter_intra_mode[1] =
+ filter_intra_mode_info.filter_intra_mode[1];
+#if CONFIG_PALETTE
+ palette_mode_info.palette_size[1] = 0;
+#endif // CONFIG_PALETTE
+ }
+#endif // CONFIG_FILTER_INTRA
+
+#if CONFIG_EXT_INTRA
+ mbmi->angle_delta[1] = best_angle_delta;
+#endif // CONFIG_EXT_INTRA
+ mbmi->uv_mode = mode_selected;
+#if CONFIG_PALETTE
pmi->palette_size[1] = palette_mode_info.palette_size[1];
if (palette_mode_info.palette_size[1] > 0) {
memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
@@ -2238,71 +4242,134 @@
rows * cols * sizeof(best_palette_color_map[0]));
}
#endif // CONFIG_PALETTE
- mbmi->uv_mode = mode_selected;
-#if CONFIG_EXT_INTRA
- mbmi->intra_angle_delta[1] = best_angle_delta;
-#endif // CONFIG_EXT_INTRA
+
return best_rd;
}
static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
- BLOCK_SIZE bsize, TX_SIZE max_tx_size,
- int *rate_uv, int *rate_uv_tokenonly,
- int64_t *dist_uv, int *skip_uv,
- PREDICTION_MODE *mode_uv) {
+ PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize,
+ TX_SIZE max_tx_size, int *rate_uv,
+ int *rate_uv_tokenonly, int64_t *dist_uv,
+ int *skip_uv, PREDICTION_MODE *mode_uv) {
// Use an estimated rd for uv_intra based on DC_PRED if the
// appropriate speed flag is set.
+ (void)ctx;
rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
*mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode;
}
static int cost_mv_ref(const AV1_COMP *const cpi, PREDICTION_MODE mode,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ int is_compound,
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
int16_t mode_context) {
#if CONFIG_REF_MV
int mode_cost = 0;
+#if CONFIG_EXT_INTER
+ int16_t mode_ctx =
+ is_compound ? mode_context : (mode_context & NEWMV_CTX_MASK);
+#else
int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
+#endif // CONFIG_EXT_INTER
int16_t is_all_zero_mv = mode_context & (1 << ALL_ZERO_FLAG_OFFSET);
assert(is_inter_mode(mode));
- if (mode == NEWMV) {
- mode_cost = cpi->newmv_mode_cost[mode_ctx][0];
- return mode_cost;
+#if CONFIG_EXT_INTER
+ if (is_compound) {
+ return cpi->inter_compound_mode_cost[mode_context]
+ [INTER_COMPOUND_OFFSET(mode)];
} else {
- mode_cost = cpi->newmv_mode_cost[mode_ctx][1];
- mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
-
- if (is_all_zero_mv) return mode_cost;
-
- if (mode == ZEROMV) {
- mode_cost += cpi->zeromv_mode_cost[mode_ctx][0];
+ if (mode == NEWMV || mode == NEWFROMNEARMV) {
+#else
+ if (mode == NEWMV) {
+#endif // CONFIG_EXT_INTER
+ mode_cost = cpi->newmv_mode_cost[mode_ctx][0];
+#if CONFIG_EXT_INTER
+ if (!is_compound)
+ mode_cost += cpi->new2mv_mode_cost[mode == NEWFROMNEARMV];
+#endif // CONFIG_EXT_INTER
return mode_cost;
} else {
- mode_cost += cpi->zeromv_mode_cost[mode_ctx][1];
- mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ mode_cost = cpi->newmv_mode_cost[mode_ctx][1];
+ mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
- if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6;
- if (mode_context & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7;
- if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8;
+ if (is_all_zero_mv) return mode_cost;
- mode_cost += cpi->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
- return mode_cost;
+ if (mode == ZEROMV) {
+ mode_cost += cpi->zeromv_mode_cost[mode_ctx][0];
+ return mode_cost;
+ } else {
+ mode_cost += cpi->zeromv_mode_cost[mode_ctx][1];
+ mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+
+ if (mode_context & (1 << SKIP_NEARESTMV_OFFSET)) mode_ctx = 6;
+ if (mode_context & (1 << SKIP_NEARMV_OFFSET)) mode_ctx = 7;
+ if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET)) mode_ctx = 8;
+
+ mode_cost += cpi->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
+ return mode_cost;
+ }
}
+#if CONFIG_EXT_INTER
}
+#endif // CONFIG_EXT_INTER
#else
assert(is_inter_mode(mode));
- return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
+#if CONFIG_EXT_INTER
+ if (is_inter_compound_mode(mode)) {
+ return cpi->inter_compound_mode_cost[mode_context]
+ [INTER_COMPOUND_OFFSET(mode)];
+ } else {
+#endif // CONFIG_EXT_INTER
+ return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
+#if CONFIG_EXT_INTER
+ }
+#endif // CONFIG_EXT_INTER
#endif
}
+#if CONFIG_GLOBAL_MOTION
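+// Estimate the number of bits needed to signal a global motion model: the
+// model-type cost plus a parameter cost that grows with the number of
+// non-zero wmmat coefficients.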
+static int get_gmbitcost(const Global_Motion_Params *gm,
+ const aom_prob *probs) {
+ int gmtype_cost[GLOBAL_MOTION_TYPES];
+ int bits;
+ av1_cost_tokens(gmtype_cost, probs, av1_global_motion_types_tree);
+ if (gm->motion_params.wmmat[5] || gm->motion_params.wmmat[4]) {
+ bits = (GM_ABS_TRANS_BITS + 1) * 2 + 4 * GM_ABS_ALPHA_BITS + 4;
+ } else if (gm->motion_params.wmmat[3] || gm->motion_params.wmmat[2]) {
+ bits = (GM_ABS_TRANS_BITS + 1) * 2 + 2 * GM_ABS_ALPHA_BITS + 2;
+ } else {
+ bits = ((gm->motion_params.wmmat[1] || gm->motion_params.wmmat[0])
+ ? ((GM_ABS_TRANS_BITS + 1) * 2)
+ : 0);
+ }
+ return bits ? (bits << AV1_PROB_COST_SHIFT) + gmtype_cost[gm->gmtype] : 0;
+}
+
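+// Rate charged for using a reference frame's global motion model: zero once
+// the model has already been used at least twice, otherwise half of
+// get_gmbitcost().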
+#define GLOBAL_MOTION_RATE(ref) \
+ (cpi->global_motion_used[ref] >= 2 \
+ ? 0 \
+ : get_gmbitcost(&cm->global_motion[(ref)], \
+ cm->fc->global_motion_types_prob) / \
+ 2);
+#endif // CONFIG_GLOBAL_MOTION
+
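+// Assign the motion vector(s) of sub-8x8 block i according to |mode| and
+// return the rate cost of coding the mode and motion vector(s).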
static int set_and_cost_bmi_mvs(const AV1_COMP *const cpi, MACROBLOCK *x,
MACROBLOCKD *xd, int i, PREDICTION_MODE mode,
int_mv this_mv[2],
- int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
- int_mv seg_mvs[MAX_REF_FRAMES],
+ int_mv frame_mv[MB_MODE_COUNT]
+ [TOTAL_REFS_PER_FRAME],
+ int_mv seg_mvs[TOTAL_REFS_PER_FRAME],
+#if CONFIG_EXT_INTER
+ int_mv compound_seg_newmvs[2],
+#endif // CONFIG_EXT_INTER
int_mv *best_ref_mv[2], const int *mvjcost,
int *mvcost[2]) {
+#if CONFIG_GLOBAL_MOTION
+ const AV1_COMMON *cm = &cpi->common;
+#endif // CONFIG_GLOBAL_MOTION
MODE_INFO *const mic = xd->mi[0];
const MB_MODE_INFO *const mbmi = &mic->mbmi;
const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
@@ -2311,18 +4378,40 @@
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
const int is_compound = has_second_ref(mbmi);
- int16_t mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
+ int mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
switch (mode) {
case NEWMV:
+#if CONFIG_EXT_INTER
+ case NEWFROMNEARMV:
+#endif // CONFIG_EXT_INTER
this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
+#if CONFIG_EXT_INTER
+ if (!cpi->common.allow_high_precision_mv)
+ lower_mv_precision(&this_mv[0].as_mv, 0);
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_REF_MV
+ for (idx = 0; idx < 1 + is_compound; ++idx) {
+ this_mv[idx] = seg_mvs[mbmi->ref_frame[idx]];
+ av1_set_mvcost(x, mbmi->ref_frame[idx], idx, mbmi->ref_mv_idx);
+ thismvcost +=
+ av1_mv_bit_cost(&this_mv[idx].as_mv, &best_ref_mv[idx]->as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT_SUB);
+ }
+ (void)mvjcost;
+ (void)mvcost;
+#else
thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+#if !CONFIG_EXT_INTER
if (is_compound) {
this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
mvjcost, mvcost, MV_COST_WEIGHT_SUB);
}
+#endif // !CONFIG_EXT_INTER
+#endif
break;
case NEARMV:
case NEARESTMV:
@@ -2331,9 +4420,71 @@
this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int;
break;
case ZEROMV:
+#if CONFIG_GLOBAL_MOTION
+ this_mv[0].as_int =
+ gm_get_motion_vector(&cpi->common.global_motion[mbmi->ref_frame[0]])
+ .as_int;
+ thismvcost += GLOBAL_MOTION_RATE(mbmi->ref_frame[0]);
+ if (is_compound) {
+ this_mv[1].as_int =
+ gm_get_motion_vector(&cpi->common.global_motion[mbmi->ref_frame[1]])
+ .as_int;
+ thismvcost += GLOBAL_MOTION_RATE(mbmi->ref_frame[1]);
+ }
+#else // CONFIG_GLOBAL_MOTION
this_mv[0].as_int = 0;
if (is_compound) this_mv[1].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
break;
+#if CONFIG_EXT_INTER
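+    // Compound inter modes: set both motion vectors and, for NEW components,
+    // add their motion-vector signalling cost.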
+ case NEW_NEWMV:
+ if (compound_seg_newmvs[0].as_int == INVALID_MV ||
+ compound_seg_newmvs[1].as_int == INVALID_MV) {
+ this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
+ this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
+ } else {
+ this_mv[0].as_int = compound_seg_newmvs[0].as_int;
+ this_mv[1].as_int = compound_seg_newmvs[1].as_int;
+ }
+ if (!cpi->common.allow_high_precision_mv)
+ lower_mv_precision(&this_mv[0].as_mv, 0);
+ if (!cpi->common.allow_high_precision_mv)
+ lower_mv_precision(&this_mv[1].as_mv, 0);
+ thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+ thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+ break;
+ case NEW_NEARMV:
+ case NEW_NEARESTMV:
+ this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
+ if (!cpi->common.allow_high_precision_mv)
+ lower_mv_precision(&this_mv[0].as_mv, 0);
+ thismvcost += av1_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+ this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int;
+ break;
+ case NEAR_NEWMV:
+ case NEAREST_NEWMV:
+ this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
+ this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
+ if (!cpi->common.allow_high_precision_mv)
+ lower_mv_precision(&this_mv[1].as_mv, 0);
+ thismvcost += av1_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
+ mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+ break;
+ case NEAREST_NEARMV:
+ case NEAR_NEARESTMV:
+ case NEAREST_NEARESTMV:
+ case NEAR_NEARMV:
+ this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
+ this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int;
+ break;
+ case ZERO_ZEROMV:
+ this_mv[0].as_int = 0;
+ this_mv[1].as_int = 0;
+ break;
+#endif // CONFIG_EXT_INTER
default: break;
}
@@ -2360,18 +4511,26 @@
memmove(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i]));
#if CONFIG_REF_MV
- mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame,
- mbmi->sb_type, i);
+#if CONFIG_EXT_INTER
+ if (is_compound)
+ mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+ else
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, mbmi->sb_type, i);
#endif
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ return cost_mv_ref(cpi, mode, is_compound, mode_ctx) + thismvcost;
+#else
return cost_mv_ref(cpi, mode, mode_ctx) + thismvcost;
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
}
static int64_t encode_inter_mb_segment(const AV1_COMP *const cpi, MACROBLOCK *x,
- int64_t best_yrd, int block,
- int *labelyrate, int64_t *distortion,
- int64_t *sse, ENTROPY_CONTEXT *ta,
- ENTROPY_CONTEXT *tl, int ir, int ic,
- int mi_row, int mi_col) {
+ int64_t best_yrd, int i, int *labelyrate,
+ int64_t *distortion, int64_t *sse,
+ ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl,
+ int ir, int ic, int mi_row, int mi_col) {
#if !CONFIG_PVQ
const AV1_COMMON *const cm = &cpi->common;
#endif
@@ -2381,80 +4540,94 @@
struct macroblock_plane *const p = &x->plane[0];
MODE_INFO *const mi = xd->mi[0];
const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
- const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
- const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+ const int width = block_size_wide[plane_bsize];
+ const int height = block_size_high[plane_bsize];
int idx, idy;
- void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
const uint8_t *const src =
- &p->src.buf[av1_raster_block_offset(BLOCK_8X8, block, p->src.stride)];
+ &p->src.buf[av1_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
uint8_t *const dst =
- &pd->dst.buf[av1_raster_block_offset(BLOCK_8X8, block, pd->dst.stride)];
+ &pd->dst.buf[av1_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)];
int64_t thisdistortion = 0, thissse = 0;
int thisrate = 0;
- TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
+ TX_SIZE tx_size = mi->mbmi.tx_size;
+
+ TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i, tx_size);
+ const int num_4x4_w = tx_size_wide_unit[tx_size];
+ const int num_4x4_h = tx_size_high_unit[tx_size];
#if !CONFIG_PVQ
- const SCAN_ORDER *scan_order = get_scan(cm, TX_4X4, tx_type);
+ const SCAN_ORDER *scan_order = get_scan(cm, tx_size, tx_type, 1);
#else
(void)cpi;
(void)ta;
(void)tl;
#endif
- av1_build_inter_predictor_sub8x8(xd, 0, block, ir, ic, mi_row, mi_col);
-
-#if CONFIG_AOM_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- fwd_txm4x4 = xd->lossless[mi->mbmi.segment_id] ? av1_highbd_fwht4x4
- : aom_highbd_fdct4x4;
- } else {
- fwd_txm4x4 = xd->lossless[mi->mbmi.segment_id] ? av1_fwht4x4 : aom_fdct4x4;
- }
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
+ assert(IMPLIES(xd->lossless[mi->mbmi.segment_id], tx_size == TX_4X4));
+ assert(IMPLIES(!xd->lossless[mi->mbmi.segment_id],
+ tx_size == max_txsize_rect_lookup[mi->mbmi.sb_type]));
#else
- fwd_txm4x4 = xd->lossless[mi->mbmi.segment_id] ? av1_fwht4x4 : aom_fdct4x4;
-#endif // CONFIG_AOM_HIGHBITDEPTH
+ assert(tx_size == TX_4X4);
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
+ assert(tx_type == DCT_DCT);
+
+ av1_build_inter_predictor_sub8x8(xd, 0, i, ir, ic, mi_row, mi_col);
#if !CONFIG_PVQ
#if CONFIG_AOM_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
aom_highbd_subtract_block(
- height, width,
- av1_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff), 8, src,
- p->src.stride, dst, pd->dst.stride, xd->bd);
+ height, width, av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride, xd->bd);
} else {
- aom_subtract_block(height, width, av1_raster_block_offset_int16(
- BLOCK_8X8, block, p->src_diff),
+ aom_subtract_block(height, width,
+ av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
8, src, p->src.stride, dst, pd->dst.stride);
}
#else
- aom_subtract_block(height, width, av1_raster_block_offset_int16(
- BLOCK_8X8, block, p->src_diff),
+ aom_subtract_block(height, width,
+ av1_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
8, src, p->src.stride, dst, pd->dst.stride);
#endif // CONFIG_AOM_HIGHBITDEPTH
#endif // !CONFIG_PVQ
- k = block;
- for (idy = 0; idy < height / 4; ++idy) {
- for (idx = 0; idx < width / 4; ++idx) {
- int64_t ssz, rd, rd1, rd2;
- tran_low_t *coeff;
-#if CONFIG_PVQ
+ k = i;
+ for (idy = 0; idy < height / 4; idy += num_4x4_h) {
+ for (idx = 0; idx < width / 4; idx += num_4x4_w) {
+ int64_t dist, ssz, rd, rd1, rd2;
+ int block;
+#if !CONFIG_PVQ
+ int coeff_ctx;
+#else
const int src_stride = p->src.stride;
const int dst_stride = pd->dst.stride;
const int diff_stride = 8;
+ tran_low_t *coeff;
tran_low_t *dqcoeff;
tran_low_t *ref_coeff;
int16_t *pred = &pd->pred[4 * (ir * diff_stride + ic)];
int16_t *src_int16 = &p->src_int16[4 * (ir * diff_stride + ic)];
- int i, j, tx_blk_size;
+ int ii, j, tx_blk_size;
int rate_pvq;
#endif
k += (idy * 2 + idx);
- coeff = BLOCK_OFFSET(p->coeff, k);
+ if (tx_size == TX_4X4)
+ block = k;
+ else
+ block = (i ? 2 : 0);
#if !CONFIG_PVQ
- fwd_txm4x4(av1_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
- coeff, 8);
- av1_regular_quantize_b_4x4(x, 0, k, scan_order->scan, scan_order->iscan);
+ coeff_ctx = combine_entropy_contexts(*(ta + (k & 1)), *(tl + (k >> 1)));
+#if CONFIG_NEW_QUANT
+ av1_xform_quant_fp_nuq(cm, x, 0, block, idy + (i >> 1), idx + (i & 0x01),
+ BLOCK_8X8, tx_size, coeff_ctx);
#else
+ av1_xform_quant(cm, x, 0, block, idy + (i >> 1), idx + (i & 0x01),
+ BLOCK_8X8, tx_size, AV1_XFORM_QUANT_FP);
+#endif // CONFIG_NEW_QUANT
+ if (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)
+ av1_optimize_b(cm, x, 0, block, tx_size, coeff_ctx);
+#else
+ coeff = BLOCK_OFFSET(p->coeff, k);
dqcoeff = BLOCK_OFFSET(pd->dqcoeff, k);
ref_coeff = BLOCK_OFFSET(pd->pvq_ref_coeff, k);
@@ -2464,54 +4637,60 @@
// copy uint8 orig and predicted block to int16 buffer
// in order to use existing VP10 transform functions
for (j = 0; j < tx_blk_size; j++)
- for (i = 0; i < tx_blk_size; i++) {
- src_int16[diff_stride * j + i] =
- src[src_stride * (j + 4 * idy) + (i + 4 * idx)];
- pred[diff_stride * j + i] =
- dst[dst_stride * (j + 4 * idy) + (i + 4 * idx)];
+ for (ii = 0; ii < tx_blk_size; ii++) {
+ src_int16[diff_stride * j + ii] =
+ src[src_stride * (j + 4 * idy) + (ii + 4 * idx)];
+ pred[diff_stride * j + ii] =
+ dst[dst_stride * (j + 4 * idy) + (ii + 4 * idx)];
}
- fwd_txm4x4(src_int16, coeff, diff_stride);
- fwd_txm4x4(pred, ref_coeff, diff_stride);
+ {
+ FWD_TXFM_PARAM fwd_txfm_param;
+ fwd_txfm_param.tx_type = DCT_DCT;
+ fwd_txfm_param.tx_size = TX_4X4;
+ fwd_txfm_param.fwd_txfm_opt = FWD_TXFM_OPT_NORMAL;
+ fwd_txfm_param.rd_transform = 0;
+ fwd_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+ fwd_txfm(src_int16, coeff, diff_stride, &fwd_txfm_param);
+ fwd_txfm(pred, ref_coeff, diff_stride, &fwd_txfm_param);
+ }
av1_pvq_encode_helper(&x->daala_enc, coeff, ref_coeff, dqcoeff,
&p->eobs[k], pd->dequant, 0, TX_4X4, tx_type,
&rate_pvq, x->pvq_speed, NULL);
#endif
-#if CONFIG_AOM_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- thisdistortion += av1_highbd_block_error(
- coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, xd->bd);
- } else {
- thisdistortion +=
- av1_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz);
- }
-#elif CONFIG_PVQ
- thisdistortion += av1_block_error2_c(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
- ref_coeff, 16, &ssz);
-#else
- thisdistortion +=
- av1_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz);
-#endif // CONFIG_AOM_HIGHBITDEPTH
+ dist_block(cpi, x, 0, block, idy + (i >> 1), idx + (i & 0x1), tx_size,
+ &dist, &ssz);
+ thisdistortion += dist;
thissse += ssz;
#if !CONFIG_PVQ
- thisrate += cost_coeffs(cm, x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
- scan_order->scan, scan_order->neighbors,
- cpi->sf.use_fast_coef_costing);
+ thisrate +=
+ av1_cost_coeffs(cm, x, 0, block, coeff_ctx, tx_size, scan_order->scan,
+ scan_order->neighbors, cpi->sf.use_fast_coef_costing);
#else
thisrate += rate_pvq;
#endif
- rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
- rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
+ *(ta + (k & 1)) = !(p->eobs[block] == 0);
+ *(tl + (k >> 1)) = !(p->eobs[block] == 0);
+#if CONFIG_EXT_TX
+ if (tx_size == TX_8X4) {
+ *(ta + (k & 1) + 1) = *(ta + (k & 1));
+ }
+ if (tx_size == TX_4X8) {
+ *(tl + (k >> 1) + 1) = *(tl + (k >> 1));
+ }
+#endif // CONFIG_EXT_TX
+ rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion);
+ rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse);
rd = AOMMIN(rd1, rd2);
if (rd >= best_yrd) return INT64_MAX;
}
}
- *distortion = thisdistortion >> 2;
+ *distortion = thisdistortion;
*labelyrate = thisrate;
- *sse = thissse >> 2;
+ *sse = thissse;
return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
}
@@ -2524,6 +4703,12 @@
int64_t bsse;
int64_t brdcost;
int_mv mvs[2];
+#if CONFIG_REF_MV
+ int_mv pred_mv[2];
+#endif
+#if CONFIG_EXT_INTER
+ int_mv ref_mv[2];
+#endif // CONFIG_EXT_INTER
ENTROPY_CONTEXT ta[2];
ENTROPY_CONTEXT tl[2];
} SEG_RDSTAT;
@@ -2538,7 +4723,11 @@
int64_t sse;
int segment_yrate;
PREDICTION_MODE modes[4];
+#if CONFIG_EXT_INTER
+ SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES];
+#else
SEG_RDSTAT rdstat[4][INTER_MODES];
+#endif // CONFIG_EXT_INTER
int mvthresh;
} BEST_SEG_INFO;
@@ -2571,21 +4760,23 @@
if (has_second_ref(mbmi)) x->e_mbd.plane[0].pre[1] = orig_pre[1];
}
-static INLINE int mv_has_subpel(const MV *mv) {
- return (mv->row & 0x0F) || (mv->col & 0x0F);
-}
-
 // Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way to encode zero motion.
 // TODO(aconverse): Find out if this is still productive, then clean up or remove
-static int check_best_zero_mv(const AV1_COMP *const cpi,
- const int16_t mode_context[MAX_REF_FRAMES],
- int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
- int this_mode,
- const MV_REFERENCE_FRAME ref_frames[2],
- const BLOCK_SIZE bsize, int block) {
+static int check_best_zero_mv(
+ const AV1_COMP *const cpi, const int16_t mode_context[TOTAL_REFS_PER_FRAME],
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ const int16_t compound_mode_context[TOTAL_REFS_PER_FRAME],
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME], int this_mode,
+ const MV_REFERENCE_FRAME ref_frames[2], const BLOCK_SIZE bsize, int block) {
+
+#if !CONFIG_EXT_INTER
+ assert(ref_frames[1] != INTRA_FRAME); // Just sanity check
+#endif
+
if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
- (ref_frames[1] == NONE ||
+ (ref_frames[1] <= INTRA_FRAME ||
frame_mv[this_mode][ref_frames[1]].as_int == 0)) {
#if CONFIG_REF_MV
int16_t rfc =
@@ -2593,9 +4784,15 @@
#else
int16_t rfc = mode_context[ref_frames[0]];
#endif
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ int c1 = cost_mv_ref(cpi, NEARMV, ref_frames[1] > INTRA_FRAME, rfc);
+ int c2 = cost_mv_ref(cpi, NEARESTMV, ref_frames[1] > INTRA_FRAME, rfc);
+ int c3 = cost_mv_ref(cpi, ZEROMV, ref_frames[1] > INTRA_FRAME, rfc);
+#else
int c1 = cost_mv_ref(cpi, NEARMV, rfc);
int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
#if !CONFIG_REF_MV
(void)bsize;
@@ -2608,7 +4805,7 @@
if (c2 > c3) return 0;
} else {
assert(this_mode == ZEROMV);
- if (ref_frames[1] == NONE) {
+ if (ref_frames[1] <= INTRA_FRAME) {
if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) ||
(c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0))
return 0;
@@ -2621,12 +4818,60 @@
}
}
}
+#if CONFIG_EXT_INTER
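+  // Same check for the compound modes: when both motion vectors are zero,
+  // drop any mode that is more expensive to signal than ZERO_ZEROMV.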
+ else if ((this_mode == NEAREST_NEARESTMV || this_mode == NEAREST_NEARMV ||
+ this_mode == NEAR_NEARESTMV || this_mode == NEAR_NEARMV ||
+ this_mode == ZERO_ZEROMV) &&
+ frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
+ frame_mv[this_mode][ref_frames[1]].as_int == 0) {
+#if CONFIG_REF_MV
+ int16_t rfc = compound_mode_context[ref_frames[0]];
+ int c1 = cost_mv_ref(cpi, NEAREST_NEARMV, 1, rfc);
+ int c2 = cost_mv_ref(cpi, NEAREST_NEARESTMV, 1, rfc);
+ int c3 = cost_mv_ref(cpi, ZERO_ZEROMV, 1, rfc);
+ int c4 = cost_mv_ref(cpi, NEAR_NEARESTMV, 1, rfc);
+ int c5 = cost_mv_ref(cpi, NEAR_NEARMV, 1, rfc);
+#else
+ int16_t rfc = mode_context[ref_frames[0]];
+ int c1 = cost_mv_ref(cpi, NEAREST_NEARMV, rfc);
+ int c2 = cost_mv_ref(cpi, NEAREST_NEARESTMV, rfc);
+ int c3 = cost_mv_ref(cpi, ZERO_ZEROMV, rfc);
+ int c4 = cost_mv_ref(cpi, NEAR_NEARESTMV, rfc);
+ int c5 = cost_mv_ref(cpi, NEAR_NEARMV, rfc);
+#endif
+
+ if (this_mode == NEAREST_NEARMV) {
+ if (c1 > c3) return 0;
+ } else if (this_mode == NEAREST_NEARESTMV) {
+ if (c2 > c3) return 0;
+ } else if (this_mode == NEAR_NEARESTMV) {
+ if (c4 > c3) return 0;
+ } else if (this_mode == NEAR_NEARMV) {
+ if (c5 > c3) return 0;
+ } else {
+ assert(this_mode == ZERO_ZEROMV);
+ if ((c3 >= c2 && frame_mv[NEAREST_NEARESTMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEAREST_NEARESTMV][ref_frames[1]].as_int == 0) ||
+ (c3 >= c1 && frame_mv[NEAREST_NEARMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEAREST_NEARMV][ref_frames[1]].as_int == 0) ||
+ (c3 >= c5 && frame_mv[NEAR_NEARMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEAR_NEARMV][ref_frames[1]].as_int == 0) ||
+ (c3 >= c4 && frame_mv[NEAR_NEARESTMV][ref_frames[0]].as_int == 0 &&
+ frame_mv[NEAR_NEARESTMV][ref_frames[1]].as_int == 0))
+ return 0;
+ }
+ }
+#endif // CONFIG_EXT_INTER
return 1;
}
static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row,
- int mi_col, int_mv single_newmv[MAX_REF_FRAMES],
+ int mi_col,
+#if CONFIG_EXT_INTER
+ int_mv *ref_mv_sub8x8[2],
+#endif
+ int_mv single_newmv[TOTAL_REFS_PER_FRAME],
int *rate_mv, const int block) {
const AV1_COMMON *const cm = &cpi->common;
const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
@@ -2637,6 +4882,14 @@
mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
int_mv ref_mv[2];
int ite, ref;
+#if CONFIG_DUAL_FILTER
+ InterpFilter interp_filter[4] = {
+ mbmi->interp_filter[0], mbmi->interp_filter[1], mbmi->interp_filter[2],
+ mbmi->interp_filter[3],
+ };
+#else
+ const InterpFilter interp_filter = mbmi->interp_filter;
+#endif
struct scale_factors sf;
// Do joint motion search in compound mode to get more accurate mv.
@@ -2649,14 +4902,19 @@
// Prediction buffer from second frame.
#if CONFIG_AOM_HIGHBITDEPTH
- DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]);
+ DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
uint8_t *second_pred;
#else
- DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]);
#endif // CONFIG_AOM_HIGHBITDEPTH
for (ref = 0; ref < 2; ++ref) {
- ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0];
+#if CONFIG_EXT_INTER
+ if (bsize < BLOCK_8X8 && ref_mv_sub8x8 != NULL)
+ ref_mv[ref].as_int = ref_mv_sub8x8[ref]->as_int;
+ else
+#endif // CONFIG_EXT_INTER
+ ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0];
if (scaled_ref_frame[ref]) {
int i;
@@ -2688,7 +4946,7 @@
struct buf_2d ref_yv12[2];
int bestsme = INT_MAX;
int sadpb = x->sadperbit16;
- MV tmp_mv;
+ MV *const best_mv = &x->best_mv.as_mv;
int search_range = 3;
int tmp_col_min = x->mv_col_min;
@@ -2703,25 +4961,33 @@
ref_yv12[0] = xd->plane[0].pre[0];
ref_yv12[1] = xd->plane[0].pre[1];
+#if CONFIG_DUAL_FILTER
+ // reload the filter types
+ interp_filter[0] =
+ (id == 0) ? mbmi->interp_filter[2] : mbmi->interp_filter[0];
+ interp_filter[1] =
+ (id == 0) ? mbmi->interp_filter[3] : mbmi->interp_filter[1];
+#endif
+
// Get the prediction block from the 'other' reference frame.
#if CONFIG_AOM_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
av1_highbd_build_inter_predictor(
ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
- &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, &mbmi->interp_filter,
+ &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, interp_filter,
MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd->bd);
} else {
second_pred = (uint8_t *)second_pred_alloc_16;
- av1_build_inter_predictor(
- ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw,
- &frame_mv[refs[!id]].as_mv, &sf, pw, ph, 0, &mbmi->interp_filter,
- MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE);
+ av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride,
+ second_pred, pw, &frame_mv[refs[!id]].as_mv,
+ &sf, pw, ph, 0, interp_filter, MV_PRECISION_Q3,
+ mi_col * MI_SIZE, mi_row * MI_SIZE);
}
#else
av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride,
second_pred, pw, &frame_mv[refs[!id]].as_mv, &sf,
- pw, ph, 0, &mbmi->interp_filter, MV_PRECISION_Q3,
+ pw, ph, 0, interp_filter, MV_PRECISION_Q3,
mi_col * MI_SIZE, mi_row * MI_SIZE);
#endif // CONFIG_AOM_HIGHBITDEPTH
@@ -2730,20 +4996,21 @@
av1_set_mv_search_range(x, &ref_mv[id].as_mv);
// Use the mv result from the single mode as mv predictor.
- tmp_mv = frame_mv[refs[id]].as_mv;
+ *best_mv = frame_mv[refs[id]].as_mv;
- tmp_mv.col >>= 3;
- tmp_mv.row >>= 3;
+ best_mv->col >>= 3;
+ best_mv->row >>= 3;
#if CONFIG_REF_MV
av1_set_mvcost(x, refs[id], id, mbmi->ref_mv_idx);
#endif
+
// Small-range full-pixel motion search.
- bestsme = av1_refining_search_8p_c(x, &tmp_mv, sadpb, search_range,
- &cpi->fn_ptr[bsize], &ref_mv[id].as_mv,
- second_pred);
+ bestsme =
+ av1_refining_search_8p_c(x, sadpb, search_range, &cpi->fn_ptr[bsize],
+ &ref_mv[id].as_mv, second_pred);
if (bestsme < INT_MAX)
- bestsme = av1_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
+ bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv[id].as_mv,
second_pred, &cpi->fn_ptr[bsize], 1);
x->mv_col_min = tmp_col_min;
@@ -2763,8 +5030,10 @@
// Set pred for Y plane
setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
- upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
- NULL, pd->subsampling_x, pd->subsampling_y);
+ upsampled_ref->y_crop_width,
+ upsampled_ref->y_crop_height, upsampled_ref->y_stride,
+ (mi_row << 3), (mi_col << 3), NULL, pd->subsampling_x,
+ pd->subsampling_y);
// If bsize < BLOCK_8X8, adjust pred pointer for this block
if (bsize < BLOCK_8X8)
@@ -2774,7 +5043,7 @@
<< 3];
bestsme = cpi->find_fractional_mv_step(
- x, &tmp_mv, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
+ x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
x->errorperbit, &cpi->fn_ptr[bsize], 0,
cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost,
&dis, &sse, second_pred, pw, ph, 1);
@@ -2784,7 +5053,7 @@
} else {
(void)block;
bestsme = cpi->find_fractional_mv_step(
- x, &tmp_mv, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
+ x, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv,
x->errorperbit, &cpi->fn_ptr[bsize], 0,
cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost,
&dis, &sse, second_pred, pw, ph, 0);
@@ -2795,7 +5064,7 @@
if (id) xd->plane[0].pre[0] = ref_yv12[0];
if (bestsme < last_besterr[id]) {
- frame_mv[refs[id]].as_mv = tmp_mv;
+ frame_mv[refs[id]].as_mv = *best_mv;
last_besterr[id] = bestsme;
} else {
break;
@@ -2811,10 +5080,21 @@
for (i = 0; i < MAX_MB_PLANE; i++)
xd->plane[i].pre[ref] = backup_yv12[ref][i];
}
-
- *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
- &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv,
- x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, refs[ref], ref, mbmi->ref_mv_idx);
+#endif
+#if CONFIG_EXT_INTER
+ if (bsize >= BLOCK_8X8)
+#endif // CONFIG_EXT_INTER
+ *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
+ &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+#if CONFIG_EXT_INTER
+ else
+ *rate_mv += av1_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
+ &ref_mv_sub8x8[ref]->as_mv, x->nmvjointcost,
+ x->mvcost, MV_COST_WEIGHT);
+#endif // CONFIG_EXT_INTER
}
}
@@ -2822,10 +5102,18 @@
const AV1_COMP *const cpi, MACROBLOCK *x, int_mv *best_ref_mv,
int_mv *second_best_ref_mv, int64_t best_rd, int *returntotrate,
int *returnyrate, int64_t *returndistortion, int *skippable, int64_t *psse,
- int mvthresh, int_mv seg_mvs[4][MAX_REF_FRAMES], BEST_SEG_INFO *bsi_buf,
- int filter_idx, int mi_row, int mi_col) {
- int i;
+ int mvthresh,
+#if CONFIG_EXT_INTER
+ int_mv seg_mvs[4][2][TOTAL_REFS_PER_FRAME],
+ int_mv compound_seg_newmvs[4][2],
+#else
+ int_mv seg_mvs[4][TOTAL_REFS_PER_FRAME],
+#endif // CONFIG_EXT_INTER
+ BEST_SEG_INFO *bsi_buf, int filter_idx, int mi_row, int mi_col) {
BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
+#if CONFIG_REF_MV
+ int_mv tmp_ref_mv[2];
+#endif
MACROBLOCKD *xd = &x->e_mbd;
MODE_INFO *mi = xd->mi[0];
MB_MODE_INFO *mbmi = &mi->mbmi;
@@ -2853,6 +5141,12 @@
od_encode_checkpoint(&x->daala_enc, &pre_buf);
#endif
+#if CONFIG_EXT_TX && CONFIG_RECT_TX
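+  // With RECT_TX, sub-8x8 partitions use the rectangular transform matching
+  // the block size unless the segment is lossless (then TX_4X4).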
+ mbmi->tx_size =
+ xd->lossless[mbmi->segment_id] ? TX_4X4 : max_txsize_rect_lookup[bsize];
+#else
+ mbmi->tx_size = TX_4X4;
+#endif // CONFIG_EXT_TX && CONFIG_RECT_TX
av1_zero(*bsi);
@@ -2862,7 +5156,19 @@
bsi->mvp.as_int = best_ref_mv->as_int;
bsi->mvthresh = mvthresh;
- for (i = 0; i < 4; i++) bsi->modes[i] = ZEROMV;
+ for (idx = 0; idx < 4; ++idx) bsi->modes[idx] = ZEROMV;
+
+#if CONFIG_REF_MV
+ for (idx = 0; idx < 4; ++idx) {
+ for (k = NEARESTMV; k <= NEWMV; ++k) {
+ bsi->rdstat[idx][INTER_OFFSET(k)].pred_mv[0].as_int = INVALID_MV;
+ bsi->rdstat[idx][INTER_OFFSET(k)].pred_mv[1].as_int = INVALID_MV;
+
+ bsi->rdstat[idx][INTER_OFFSET(k)].mvs[0].as_int = INVALID_MV;
+ bsi->rdstat[idx][INTER_OFFSET(k)].mvs[1].as_int = INVALID_MV;
+ }
+ }
+#endif
memcpy(t_above, pd->above_context, sizeof(t_above));
memcpy(t_left, pd->left_context, sizeof(t_left));
@@ -2879,11 +5185,19 @@
// TODO(jingning,rbultje): rewrite the rate-distortion optimization
  // loop for 4x4/4x8/8x4 block coding; to be replaced with a new rd loop
int_mv mode_mv[MB_MODE_COUNT][2];
- int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
+ int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
PREDICTION_MODE mode_selected = ZEROMV;
int64_t new_best_rd = INT64_MAX;
const int index = idy * 2 + idx;
int ref;
+#if CONFIG_REF_MV
+ CANDIDATE_MV ref_mv_stack[2][MAX_REF_MV_STACK_SIZE];
+ uint8_t ref_mv_count[2];
+#endif
+#if CONFIG_EXT_INTER
+ int mv_idx;
+ int_mv ref_mvs_sub8x8[2][2];
+#endif // CONFIG_EXT_INTER
#if CONFIG_PVQ
od_rollback_buffer idx_buf, post_buf;
od_encode_checkpoint(&x->daala_enc, &idx_buf);
@@ -2892,24 +5206,166 @@
for (ref = 0; ref < 1 + has_second_rf; ++ref) {
const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+#if CONFIG_EXT_INTER
+ int_mv mv_ref_list[MAX_MV_REF_CANDIDATES];
+ av1_update_mv_context(xd, mi, frame, mv_ref_list, index, mi_row, mi_col,
+ NULL);
+#endif // CONFIG_EXT_INTER
+#if CONFIG_GLOBAL_MOTION
+ frame_mv[ZEROMV][frame].as_int =
+ gm_get_motion_vector(&cm->global_motion[frame]).as_int;
+#else // CONFIG_GLOBAL_MOTION
frame_mv[ZEROMV][frame].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
av1_append_sub8x8_mvs_for_idx(cm, xd, index, ref, mi_row, mi_col,
+#if CONFIG_REF_MV
+ ref_mv_stack[ref], &ref_mv_count[ref],
+#endif
+#if CONFIG_EXT_INTER
+ mv_ref_list,
+#endif // CONFIG_EXT_INTER
&frame_mv[NEARESTMV][frame],
&frame_mv[NEARMV][frame]);
+
+#if CONFIG_REF_MV
+ tmp_ref_mv[ref] = frame_mv[NEARESTMV][mbmi->ref_frame[ref]];
+ lower_mv_precision(&tmp_ref_mv[ref].as_mv, cm->allow_high_precision_mv);
+ bsi->ref_mv[ref] = &tmp_ref_mv[ref];
+ mbmi_ext->ref_mvs[frame][0] = tmp_ref_mv[ref];
+#endif
+
+#if CONFIG_EXT_INTER
+ mv_ref_list[0].as_int = frame_mv[NEARESTMV][frame].as_int;
+ mv_ref_list[1].as_int = frame_mv[NEARMV][frame].as_int;
+ av1_find_best_ref_mvs(cm->allow_high_precision_mv, mv_ref_list,
+ &ref_mvs_sub8x8[0][ref], &ref_mvs_sub8x8[1][ref]);
+
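+ // For the EXT_INTER compound modes, seed each side of the mode pair with
+ // the single-reference NEARESTMV / NEARMV candidates found above.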
+ if (has_second_rf) {
+ frame_mv[ZERO_ZEROMV][frame].as_int = 0;
+ frame_mv[NEAREST_NEARESTMV][frame].as_int =
+ frame_mv[NEARESTMV][frame].as_int;
+
+ if (ref == 0) {
+ frame_mv[NEAREST_NEARMV][frame].as_int =
+ frame_mv[NEARESTMV][frame].as_int;
+ frame_mv[NEAR_NEARESTMV][frame].as_int =
+ frame_mv[NEARMV][frame].as_int;
+ frame_mv[NEAREST_NEWMV][frame].as_int =
+ frame_mv[NEARESTMV][frame].as_int;
+ frame_mv[NEAR_NEWMV][frame].as_int = frame_mv[NEARMV][frame].as_int;
+ frame_mv[NEAR_NEARMV][frame].as_int =
+ frame_mv[NEARMV][frame].as_int;
+ } else if (ref == 1) {
+ frame_mv[NEAREST_NEARMV][frame].as_int =
+ frame_mv[NEARMV][frame].as_int;
+ frame_mv[NEAR_NEARESTMV][frame].as_int =
+ frame_mv[NEARESTMV][frame].as_int;
+ frame_mv[NEW_NEARESTMV][frame].as_int =
+ frame_mv[NEARESTMV][frame].as_int;
+ frame_mv[NEW_NEARMV][frame].as_int = frame_mv[NEARMV][frame].as_int;
+ frame_mv[NEAR_NEARMV][frame].as_int =
+ frame_mv[NEARMV][frame].as_int;
+ }
+ }
+#endif // CONFIG_EXT_INTER
}
- // search for the best motion vector on this segment
- for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+// search for the best motion vector on this segment
+#if CONFIG_EXT_INTER
+ for (this_mode = (has_second_rf ? NEAREST_NEARESTMV : NEARESTMV);
+ this_mode <= (has_second_rf ? NEW_NEWMV : NEWFROMNEARMV);
+ ++this_mode)
+#else
+ for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode)
+#endif // CONFIG_EXT_INTER
+ {
const struct buf_2d orig_src = x->plane[0].src;
struct buf_2d orig_pre[2];
+ // This flag controls whether motion estimation is run. When it is set to
+ // a non-zero value, the encoder will force motion estimation.
+ int run_mv_search = 0;
mode_idx = INTER_OFFSET(this_mode);
+#if CONFIG_EXT_INTER
+ mv_idx = (this_mode == NEWFROMNEARMV) ? 1 : 0;
+
+ for (ref = 0; ref < 1 + has_second_rf; ++ref)
+ bsi->ref_mv[ref]->as_int = ref_mvs_sub8x8[mv_idx][ref].as_int;
+#endif // CONFIG_EXT_INTER
bsi->rdstat[index][mode_idx].brdcost = INT64_MAX;
if (!(inter_mode_mask & (1 << this_mode))) continue;
- if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv,
- this_mode, mbmi->ref_frame, bsize, index))
- continue;
+#if CONFIG_REF_MV
+ run_mv_search = 2;
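+ // For later filter passes and NEWMV, keep one pending check per reference:
+ // the counter is decremented whenever the earlier pass produced the same
+ // predicted/searched mv pair, and reaching zero means that pass's result
+ // can be reused instead of running the motion search again.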
+#if !CONFIG_EXT_INTER
+ if (filter_idx > 0 && this_mode == NEWMV) {
+ BEST_SEG_INFO *ref_bsi = bsi_buf;
+ SEG_RDSTAT *ref_rdstat = &ref_bsi->rdstat[index][mode_idx];
+
+ if (has_second_rf) {
+ if (seg_mvs[index][mbmi->ref_frame[0]].as_int ==
+ ref_rdstat->mvs[0].as_int &&
+ ref_rdstat->mvs[0].as_int != INVALID_MV)
+ if (bsi->ref_mv[0]->as_int == ref_rdstat->pred_mv[0].as_int)
+ --run_mv_search;
+
+ if (seg_mvs[index][mbmi->ref_frame[1]].as_int ==
+ ref_rdstat->mvs[1].as_int &&
+ ref_rdstat->mvs[1].as_int != INVALID_MV)
+ if (bsi->ref_mv[1]->as_int == ref_rdstat->pred_mv[1].as_int)
+ --run_mv_search;
+ } else {
+ if (bsi->ref_mv[0]->as_int == ref_rdstat->pred_mv[0].as_int &&
+ ref_rdstat->mvs[0].as_int != INVALID_MV) {
+ run_mv_search = 0;
+ seg_mvs[index][mbmi->ref_frame[0]].as_int =
+ ref_rdstat->mvs[0].as_int;
+ }
+ }
+
+ if (run_mv_search != 0 && filter_idx > 1) {
+ ref_bsi = bsi_buf + 1;
+ ref_rdstat = &ref_bsi->rdstat[index][mode_idx];
+ run_mv_search = 2;
+
+ if (has_second_rf) {
+ if (seg_mvs[index][mbmi->ref_frame[0]].as_int ==
+ ref_rdstat->mvs[0].as_int &&
+ ref_rdstat->mvs[0].as_int != INVALID_MV)
+ if (bsi->ref_mv[0]->as_int == ref_rdstat->pred_mv[0].as_int)
+ --run_mv_search;
+
+ if (seg_mvs[index][mbmi->ref_frame[1]].as_int ==
+ ref_rdstat->mvs[1].as_int &&
+ ref_rdstat->mvs[1].as_int != INVALID_MV)
+ if (bsi->ref_mv[1]->as_int == ref_rdstat->pred_mv[1].as_int)
+ --run_mv_search;
+ } else {
+ if (bsi->ref_mv[0]->as_int == ref_rdstat->pred_mv[0].as_int &&
+ ref_rdstat->mvs[0].as_int != INVALID_MV) {
+ run_mv_search = 0;
+ seg_mvs[index][mbmi->ref_frame[0]].as_int =
+ ref_rdstat->mvs[0].as_int;
+ }
+ }
+ }
+ }
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_REF_MV
+
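+ // With global motion, ZEROMV codes the reference's global model rather
+ // than a literal zero vector, so the usual zero-mv pruning below is only
+ // applied when both references use the identity (GLOBAL_ZERO) model.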
+#if CONFIG_GLOBAL_MOTION
+ if (get_gmtype(&cm->global_motion[mbmi->ref_frame[0]]) == GLOBAL_ZERO &&
+ (!has_second_rf ||
+ get_gmtype(&cm->global_motion[mbmi->ref_frame[1]]) == GLOBAL_ZERO))
+#endif // CONFIG_GLOBAL_MOTION
+
+ if (!check_best_zero_mv(cpi, mbmi_ext->mode_context,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ mbmi_ext->compound_mode_context,
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ frame_mv, this_mode, mbmi->ref_frame, bsize,
+ index))
+ continue;
memcpy(orig_pre, pd->pre, sizeof(orig_pre));
memcpy(bsi->rdstat[index][mode_idx].ta, t_above,
@@ -2921,9 +5377,16 @@
#endif
// motion search for newmv (single predictor case only)
- if (!has_second_rf && this_mode == NEWMV &&
- seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV) {
- MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
+ if (!has_second_rf &&
+#if CONFIG_EXT_INTER
+ have_newmv_in_inter_mode(this_mode) &&
+ (seg_mvs[index][mv_idx][mbmi->ref_frame[0]].as_int == INVALID_MV)
+#else
+ this_mode == NEWMV &&
+ (seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV ||
+ run_mv_search)
+#endif // CONFIG_EXT_INTER
+ ) {
int step_param = 0;
int bestsme = INT_MAX;
int sadpb = x->sadperbit4;
@@ -2940,19 +5403,24 @@
if (new_best_rd < label_mv_thresh) break;
if (cpi->oxcf.mode != BEST) {
- // use previous block's result as next block's MV predictor.
+#if CONFIG_EXT_INTER
+ bsi->mvp.as_int = bsi->ref_mv[0]->as_int;
+#else
+// use previous block's result as next block's MV predictor.
+#if !CONFIG_REF_MV
if (index > 0) {
bsi->mvp.as_int = mi->bmi[index - 1].as_mv[0].as_int;
- if (index == 2) {
+ if (index == 2)
bsi->mvp.as_int = mi->bmi[index - 2].as_mv[0].as_int;
- }
}
+#endif
+#endif // CONFIG_EXT_INTER
}
- max_mv =
- (index == 0)
- ? (int)x->max_mv_context[mbmi->ref_frame[0]]
- : (AOMMAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >>
- 3);
+ max_mv = (index == 0) ? (int)x->max_mv_context[mbmi->ref_frame[0]]
+ : AOMMAX(abs(bsi->mvp.as_mv.row),
+ abs(bsi->mvp.as_mv.col)) >>
+ 3;
+
if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
// Take wtd average of the step_params based on the last frame's
// max mv magnitude and the best ref mvs of the current block for
@@ -2963,8 +5431,13 @@
step_param = cpi->mv_step_param;
}
+#if CONFIG_REF_MV
+ mvp_full.row = bsi->ref_mv[0]->as_mv.row >> 3;
+ mvp_full.col = bsi->ref_mv[0]->as_mv.col >> 3;
+#else
mvp_full.row = bsi->mvp.as_mv.row >> 3;
mvp_full.col = bsi->mvp.as_mv.col >> 3;
+#endif
if (cpi->sf.adaptive_motion_search) {
mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
@@ -2977,13 +5450,15 @@
av1_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
+ x->best_mv.as_int = x->second_best_mv.as_int = INVALID_MV;
+
#if CONFIG_REF_MV
av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx);
#endif
bestsme = av1_full_pixel_search(
cpi, x, bsize, &mvp_full, step_param, sadpb,
cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL,
- &bsi->ref_mv[0]->as_mv, new_mv, INT_MAX, 1);
+ &bsi->ref_mv[0]->as_mv, INT_MAX, 1);
x->mv_col_min = tmp_col_min;
x->mv_col_max = tmp_col_max;
@@ -2993,6 +5468,10 @@
if (bestsme < INT_MAX) {
int distortion;
if (cpi->sf.use_upsampled_references) {
+ int best_mv_var;
+ const int try_second =
+ x->second_best_mv.as_int != INVALID_MV &&
+ x->second_best_mv.as_int != x->best_mv.as_int;
const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
// Use up-sampled reference frames.
@@ -3001,10 +5480,11 @@
get_upsampled_ref(cpi, mbmi->ref_frame[0]);
// Set pred for Y plane
- setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
- upsampled_ref->y_stride, (mi_row << 3),
- (mi_col << 3), NULL, pd->subsampling_x,
- pd->subsampling_y);
+ setup_pred_plane(
+ &pd->pre[0], upsampled_ref->y_buffer,
+ upsampled_ref->y_crop_width, upsampled_ref->y_crop_height,
+ upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3), NULL,
+ pd->subsampling_x, pd->subsampling_y);
// adjust pred pointer for this block
pd->pre[0].buf =
@@ -3012,56 +5492,125 @@
pd->pre[0].stride))
<< 3];
- cpi->find_fractional_mv_step(
- x, new_mv, &bsi->ref_mv[0]->as_mv,
- cm->allow_high_precision_mv, x->errorperbit,
- &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+ best_mv_var = cpi->find_fractional_mv_step(
+ x, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
cpi->sf.mv.subpel_iters_per_step,
cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost,
&distortion, &x->pred_sse[mbmi->ref_frame[0]], NULL, pw, ph,
1);
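+ // If the full-pel search kept a distinct runner-up (second_best_mv),
+ // refine that candidate at sub-pel as well and keep whichever of the
+ // two gives the lower error.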
+ if (try_second) {
+ int this_var;
+ MV best_mv = x->best_mv.as_mv;
+ const MV ref_mv = bsi->ref_mv[0]->as_mv;
+ const int minc = AOMMAX(x->mv_col_min * 8, ref_mv.col - MV_MAX);
+ const int maxc = AOMMIN(x->mv_col_max * 8, ref_mv.col + MV_MAX);
+ const int minr = AOMMAX(x->mv_row_min * 8, ref_mv.row - MV_MAX);
+ const int maxr = AOMMIN(x->mv_row_max * 8, ref_mv.row + MV_MAX);
+
+ x->best_mv = x->second_best_mv;
+ if (x->best_mv.as_mv.row * 8 <= maxr &&
+ x->best_mv.as_mv.row * 8 >= minr &&
+ x->best_mv.as_mv.col * 8 <= maxc &&
+ x->best_mv.as_mv.col * 8 >= minc) {
+ this_var = cpi->find_fractional_mv_step(
+ x, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), x->nmvjointcost,
+ x->mvcost, &distortion, &x->pred_sse[mbmi->ref_frame[0]],
+ NULL, pw, ph, 1);
+ if (this_var < best_mv_var) best_mv = x->best_mv.as_mv;
+ x->best_mv.as_mv = best_mv;
+ }
+ }
+
// Restore the reference frames.
pd->pre[0] = backup_pred;
} else {
cpi->find_fractional_mv_step(
- x, new_mv, &bsi->ref_mv[0]->as_mv,
- cm->allow_high_precision_mv, x->errorperbit,
- &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+ x, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
cpi->sf.mv.subpel_iters_per_step,
cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost,
&distortion, &x->pred_sse[mbmi->ref_frame[0]], NULL, 0, 0, 0);
}
- // save motion search result for use in compound prediction
- seg_mvs[index][mbmi->ref_frame[0]].as_mv = *new_mv;
+// save motion search result for use in compound prediction
+#if CONFIG_EXT_INTER
+ seg_mvs[index][mv_idx][mbmi->ref_frame[0]].as_mv = x->best_mv.as_mv;
+#else
+ seg_mvs[index][mbmi->ref_frame[0]].as_mv = x->best_mv.as_mv;
+#endif // CONFIG_EXT_INTER
}
if (cpi->sf.adaptive_motion_search)
- x->pred_mv[mbmi->ref_frame[0]] = *new_mv;
+ x->pred_mv[mbmi->ref_frame[0]] = x->best_mv.as_mv;
+
+#if CONFIG_EXT_INTER
+ mode_mv[this_mode][0] = x->best_mv;
+#else
+ mode_mv[NEWMV][0] = x->best_mv;
+#endif // CONFIG_EXT_INTER
// restore src pointers
mi_buf_restore(x, orig_src, orig_pre);
}
if (has_second_rf) {
+#if CONFIG_EXT_INTER
+ if (seg_mvs[index][mv_idx][mbmi->ref_frame[1]].as_int == INVALID_MV ||
+ seg_mvs[index][mv_idx][mbmi->ref_frame[0]].as_int == INVALID_MV)
+#else
if (seg_mvs[index][mbmi->ref_frame[1]].as_int == INVALID_MV ||
seg_mvs[index][mbmi->ref_frame[0]].as_int == INVALID_MV)
+#endif // CONFIG_EXT_INTER
continue;
}
- if (has_second_rf && this_mode == NEWMV &&
- mbmi->interp_filter == EIGHTTAP) {
+#if CONFIG_DUAL_FILTER
+ (void)run_mv_search;
+#endif
+
+ if (has_second_rf &&
+#if CONFIG_EXT_INTER
+ this_mode == NEW_NEWMV &&
+#else
+ this_mode == NEWMV &&
+#endif // CONFIG_EXT_INTER
+#if CONFIG_DUAL_FILTER
+ (mbmi->interp_filter[0] == EIGHTTAP_REGULAR || run_mv_search))
+#else
+ (mbmi->interp_filter == EIGHTTAP_REGULAR || run_mv_search))
+#endif
+ {
// adjust src pointers
mi_buf_shift(x, index);
if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
int rate_mv;
joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row,
- mi_col, seg_mvs[index], &rate_mv, index);
+ mi_col,
+#if CONFIG_EXT_INTER
+ bsi->ref_mv, seg_mvs[index][mv_idx],
+#else
+ seg_mvs[index],
+#endif // CONFIG_EXT_INTER
+ &rate_mv, index);
+#if CONFIG_EXT_INTER
+ compound_seg_newmvs[index][0].as_int =
+ frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
+ compound_seg_newmvs[index][1].as_int =
+ frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
+#else
seg_mvs[index][mbmi->ref_frame[0]].as_int =
frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
seg_mvs[index][mbmi->ref_frame[1]].as_int =
frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
+#endif // CONFIG_EXT_INTER
}
// restore src pointers
mi_buf_restore(x, orig_src, orig_pre);
@@ -3069,7 +5618,12 @@
bsi->rdstat[index][mode_idx].brate = set_and_cost_bmi_mvs(
cpi, x, xd, index, this_mode, mode_mv[this_mode], frame_mv,
- seg_mvs[index], bsi->ref_mv, x->nmvjointcost, x->mvcost);
+#if CONFIG_EXT_INTER
+ seg_mvs[index][mv_idx], compound_seg_newmvs[index],
+#else
+ seg_mvs[index],
+#endif // CONFIG_EXT_INTER
+ bsi->ref_mv, x->nmvjointcost, x->mvcost);
for (ref = 0; ref < 1 + has_second_rf; ++ref) {
bsi->rdstat[index][mode_idx].mvs[ref].as_int =
@@ -3080,6 +5634,26 @@
if (num_4x4_blocks_high > 1)
bsi->rdstat[index + 2][mode_idx].mvs[ref].as_int =
mode_mv[this_mode][ref].as_int;
+#if CONFIG_REF_MV
+ bsi->rdstat[index][mode_idx].pred_mv[ref].as_int =
+ mi->bmi[index].pred_mv[ref].as_int;
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[index + 1][mode_idx].pred_mv[ref].as_int =
+ mi->bmi[index].pred_mv[ref].as_int;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[index + 2][mode_idx].pred_mv[ref].as_int =
+ mi->bmi[index].pred_mv[ref].as_int;
+#endif
+#if CONFIG_EXT_INTER
+ bsi->rdstat[index][mode_idx].ref_mv[ref].as_int =
+ bsi->ref_mv[ref]->as_int;
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[index + 1][mode_idx].ref_mv[ref].as_int =
+ bsi->ref_mv[ref]->as_int;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[index + 2][mode_idx].ref_mv[ref].as_int =
+ bsi->ref_mv[ref]->as_int;
+#endif // CONFIG_EXT_INTER
}
// Trap vectors that reach beyond the UMV borders
@@ -3094,22 +5668,68 @@
for (ref = 0; ref < 1 + has_second_rf; ++ref) {
subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv);
- have_ref &= mode_mv[this_mode][ref].as_int ==
- ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int;
+#if CONFIG_EXT_INTER
+ if (have_newmv_in_inter_mode(this_mode))
+ have_ref &=
+ ((mode_mv[this_mode][ref].as_int ==
+ ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int) &&
+ (bsi->ref_mv[ref]->as_int ==
+ ref_bsi->rdstat[index][mode_idx].ref_mv[ref].as_int));
+ else
+#endif // CONFIG_EXT_INTER
+ have_ref &= mode_mv[this_mode][ref].as_int ==
+ ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int;
}
+ have_ref &= ref_bsi->rdstat[index][mode_idx].brate > 0;
+
if (filter_idx > 1 && !subpelmv && !have_ref) {
ref_bsi = bsi_buf + 1;
have_ref = 1;
for (ref = 0; ref < 1 + has_second_rf; ++ref)
- have_ref &= mode_mv[this_mode][ref].as_int ==
- ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int;
+#if CONFIG_EXT_INTER
+ if (have_newmv_in_inter_mode(this_mode))
+ have_ref &=
+ ((mode_mv[this_mode][ref].as_int ==
+ ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int) &&
+ (bsi->ref_mv[ref]->as_int ==
+ ref_bsi->rdstat[index][mode_idx].ref_mv[ref].as_int));
+ else
+#endif // CONFIG_EXT_INTER
+ have_ref &= mode_mv[this_mode][ref].as_int ==
+ ref_bsi->rdstat[index][mode_idx].mvs[ref].as_int;
+
+ have_ref &= ref_bsi->rdstat[index][mode_idx].brate > 0;
}
if (!subpelmv && have_ref &&
ref_bsi->rdstat[index][mode_idx].brdcost < INT64_MAX) {
+#if CONFIG_REF_MV
+ bsi->rdstat[index][mode_idx].byrate =
+ ref_bsi->rdstat[index][mode_idx].byrate;
+ bsi->rdstat[index][mode_idx].bdist =
+ ref_bsi->rdstat[index][mode_idx].bdist;
+ bsi->rdstat[index][mode_idx].bsse =
+ ref_bsi->rdstat[index][mode_idx].bsse;
+ bsi->rdstat[index][mode_idx].brate +=
+ ref_bsi->rdstat[index][mode_idx].byrate;
+ bsi->rdstat[index][mode_idx].eobs =
+ ref_bsi->rdstat[index][mode_idx].eobs;
+
+ bsi->rdstat[index][mode_idx].brdcost =
+ RDCOST(x->rdmult, x->rddiv, bsi->rdstat[index][mode_idx].brate,
+ bsi->rdstat[index][mode_idx].bdist);
+
+ memcpy(bsi->rdstat[index][mode_idx].ta,
+ ref_bsi->rdstat[index][mode_idx].ta,
+ sizeof(bsi->rdstat[index][mode_idx].ta));
+ memcpy(bsi->rdstat[index][mode_idx].tl,
+ ref_bsi->rdstat[index][mode_idx].tl,
+ sizeof(bsi->rdstat[index][mode_idx].tl));
+#else
memcpy(&bsi->rdstat[index][mode_idx],
&ref_bsi->rdstat[index][mode_idx], sizeof(SEG_RDSTAT));
+#endif
if (num_4x4_blocks_wide > 1)
bsi->rdstat[index + 1][mode_idx].eobs =
ref_bsi->rdstat[index + 1][mode_idx].eobs;
@@ -3118,6 +5738,24 @@
ref_bsi->rdstat[index + 2][mode_idx].eobs;
if (bsi->rdstat[index][mode_idx].brdcost < new_best_rd) {
+#if CONFIG_REF_MV
+ // If the NEWMV mode is using the same motion vector as the
+ // NEARESTMV mode, skip the remaining rate-distortion calculations
+ // and use the inferred motion vector modes.
+ if (this_mode == NEWMV) {
+ if (has_second_rf) {
+ if (bsi->rdstat[index][mode_idx].mvs[0].as_int ==
+ bsi->ref_mv[0]->as_int &&
+ bsi->rdstat[index][mode_idx].mvs[1].as_int ==
+ bsi->ref_mv[1]->as_int)
+ continue;
+ } else {
+ if (bsi->rdstat[index][mode_idx].mvs[0].as_int ==
+ bsi->ref_mv[0]->as_int)
+ continue;
+ }
+ }
+#endif
mode_selected = this_mode;
new_best_rd = bsi->rdstat[index][mode_idx].brdcost;
#if CONFIG_PVQ
@@ -3134,6 +5772,7 @@
&bsi->rdstat[index][mode_idx].bdist,
&bsi->rdstat[index][mode_idx].bsse, bsi->rdstat[index][mode_idx].ta,
bsi->rdstat[index][mode_idx].tl, idy, idx, mi_row, mi_col);
+
if (bsi->rdstat[index][mode_idx].brdcost < INT64_MAX) {
bsi->rdstat[index][mode_idx].brdcost += RDCOST(
x->rdmult, x->rddiv, bsi->rdstat[index][mode_idx].brate, 0);
@@ -3147,6 +5786,24 @@
}
if (bsi->rdstat[index][mode_idx].brdcost < new_best_rd) {
+#if CONFIG_REF_MV
+ // If the NEWMV mode is using the same motion vector as the
+ // NEARESTMV mode, skip the remaining rate-distortion calculations
+ // and use the inferred motion vector modes.
+ if (this_mode == NEWMV) {
+ if (has_second_rf) {
+ if (bsi->rdstat[index][mode_idx].mvs[0].as_int ==
+ bsi->ref_mv[0]->as_int &&
+ bsi->rdstat[index][mode_idx].mvs[1].as_int ==
+ bsi->ref_mv[1]->as_int)
+ continue;
+ } else {
+ if (bsi->rdstat[index][mode_idx].mvs[0].as_int ==
+ bsi->ref_mv[0]->as_int)
+ continue;
+ }
+ }
+#endif
mode_selected = this_mode;
new_best_rd = bsi->rdstat[index][mode_idx].brdcost;
@@ -3159,7 +5816,11 @@
if (new_best_rd == INT64_MAX) {
int iy, midx;
for (iy = index + 1; iy < 4; ++iy)
+#if CONFIG_EXT_INTER
+ for (midx = 0; midx < INTER_MODES + INTER_COMPOUND_MODES; ++midx)
+#else
for (midx = 0; midx < INTER_MODES; ++midx)
+#endif // CONFIG_EXT_INTER
bsi->rdstat[iy][midx].brdcost = INT64_MAX;
bsi->segment_rd = INT64_MAX;
#if CONFIG_PVQ
@@ -3175,8 +5836,19 @@
od_encode_rollback(&x->daala_enc, &post_buf);
#endif
+#if CONFIG_EXT_INTER
+ mv_idx = (mode_selected == NEWFROMNEARMV) ? 1 : 0;
+ bsi->ref_mv[0]->as_int = bsi->rdstat[index][mode_idx].ref_mv[0].as_int;
+ if (has_second_rf)
+ bsi->ref_mv[1]->as_int = bsi->rdstat[index][mode_idx].ref_mv[1].as_int;
+#endif // CONFIG_EXT_INTER
set_and_cost_bmi_mvs(cpi, x, xd, index, mode_selected,
- mode_mv[mode_selected], frame_mv, seg_mvs[index],
+ mode_mv[mode_selected], frame_mv,
+#if CONFIG_EXT_INTER
+ seg_mvs[index][mv_idx], compound_seg_newmvs[index],
+#else
+ seg_mvs[index],
+#endif // CONFIG_EXT_INTER
bsi->ref_mv, x->nmvjointcost, x->mvcost);
br += bsi->rdstat[index][mode_idx].brate;
@@ -3188,7 +5860,11 @@
if (this_segment_rd > bsi->segment_rd) {
int iy, midx;
for (iy = index + 1; iy < 4; ++iy)
+#if CONFIG_EXT_INTER
+ for (midx = 0; midx < INTER_MODES + INTER_COMPOUND_MODES; ++midx)
+#else
for (midx = 0; midx < INTER_MODES; ++midx)
+#endif // CONFIG_EXT_INTER
bsi->rdstat[iy][midx].brdcost = INT64_MAX;
bsi->segment_rd = INT64_MAX;
#if CONFIG_PVQ
@@ -3213,13 +5889,24 @@
if (bsi->segment_rd > best_rd) return INT64_MAX;
/* set it to the best */
- for (i = 0; i < 4; i++) {
- mode_idx = INTER_OFFSET(bsi->modes[i]);
- mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
+ for (idx = 0; idx < 4; idx++) {
+ mode_idx = INTER_OFFSET(bsi->modes[idx]);
+ mi->bmi[idx].as_mv[0].as_int = bsi->rdstat[idx][mode_idx].mvs[0].as_int;
if (has_second_ref(mbmi))
- mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
- x->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
- mi->bmi[i].as_mode = bsi->modes[i];
+ mi->bmi[idx].as_mv[1].as_int = bsi->rdstat[idx][mode_idx].mvs[1].as_int;
+#if CONFIG_REF_MV
+ mi->bmi[idx].pred_mv[0] = bsi->rdstat[idx][mode_idx].pred_mv[0];
+ if (has_second_ref(mbmi))
+ mi->bmi[idx].pred_mv[1] = bsi->rdstat[idx][mode_idx].pred_mv[1];
+#endif
+#if CONFIG_EXT_INTER
+ mi->bmi[idx].ref_mv[0].as_int = bsi->rdstat[idx][mode_idx].ref_mv[0].as_int;
+ if (has_second_rf)
+ mi->bmi[idx].ref_mv[1].as_int =
+ bsi->rdstat[idx][mode_idx].ref_mv[1].as_int;
+#endif // CONFIG_EXT_INTER
+ x->plane[0].eobs[idx] = bsi->rdstat[idx][mode_idx].eobs;
+ mi->bmi[idx].as_mode = bsi->modes[idx];
}
/*
@@ -3243,8 +5930,9 @@
int seg_ref_active =
segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
if (seg_ref_active) {
- memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
- memset(ref_costs_comp, 0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
+ memset(ref_costs_single, 0,
+ TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_single));
+ memset(ref_costs_comp, 0, TOTAL_REFS_PER_FRAME * sizeof(*ref_costs_comp));
*comp_mode_p = 128;
} else {
aom_prob intra_inter_p = av1_get_intra_inter_prob(cm, xd);
@@ -3270,9 +5958,6 @@
unsigned int base_cost = av1_cost_bit(intra_inter_p, 1);
- if (cm->reference_mode == REFERENCE_MODE_SELECT)
- base_cost += av1_cost_bit(comp_inter_p, 0);
-
ref_costs_single[LAST_FRAME] =
#if CONFIG_EXT_REFS
ref_costs_single[LAST2_FRAME] = ref_costs_single[LAST3_FRAME] =
@@ -3306,6 +5991,7 @@
ref_costs_single[LAST_FRAME] += av1_cost_bit(ref_single_p1, 0);
ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p1, 1);
ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p1, 1);
+
ref_costs_single[GOLDEN_FRAME] += av1_cost_bit(ref_single_p2, 0);
ref_costs_single[ALTREF_FRAME] += av1_cost_bit(ref_single_p2, 1);
#endif // CONFIG_EXT_REFS
@@ -3319,21 +6005,17 @@
ref_costs_single[GOLDEN_FRAME] = 512;
ref_costs_single[ALTREF_FRAME] = 512;
}
+
if (cm->reference_mode != SINGLE_REFERENCE) {
-#if CONFIG_EXT_REFS
- aom_prob fwdref_comp_p = av1_get_pred_prob_comp_fwdref_p(cm, xd);
- aom_prob fwdref_comp_p1 = av1_get_pred_prob_comp_fwdref_p1(cm, xd);
- aom_prob fwdref_comp_p2 = av1_get_pred_prob_comp_fwdref_p2(cm, xd);
- aom_prob bwdref_comp_p = av1_get_pred_prob_comp_bwdref_p(cm, xd);
-#else
aom_prob ref_comp_p = av1_get_pred_prob_comp_ref_p(cm, xd);
+#if CONFIG_EXT_REFS
+ aom_prob ref_comp_p1 = av1_get_pred_prob_comp_ref_p1(cm, xd);
+ aom_prob ref_comp_p2 = av1_get_pred_prob_comp_ref_p2(cm, xd);
+ aom_prob bwdref_comp_p = av1_get_pred_prob_comp_bwdref_p(cm, xd);
#endif // CONFIG_EXT_REFS
unsigned int base_cost = av1_cost_bit(intra_inter_p, 1);
- if (cm->reference_mode == REFERENCE_MODE_SELECT)
- base_cost += av1_cost_bit(comp_inter_p, 1);
-
ref_costs_comp[LAST_FRAME] =
#if CONFIG_EXT_REFS
ref_costs_comp[LAST2_FRAME] = ref_costs_comp[LAST3_FRAME] =
@@ -3345,18 +6027,19 @@
#endif // CONFIG_EXT_REFS
#if CONFIG_EXT_REFS
- ref_costs_comp[LAST_FRAME] += av1_cost_bit(fwdref_comp_p, 0);
- ref_costs_comp[LAST2_FRAME] += av1_cost_bit(fwdref_comp_p, 0);
- ref_costs_comp[LAST3_FRAME] += av1_cost_bit(fwdref_comp_p, 1);
- ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(fwdref_comp_p, 1);
+ ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p, 0);
+ ref_costs_comp[LAST2_FRAME] += av1_cost_bit(ref_comp_p, 0);
+ ref_costs_comp[LAST3_FRAME] += av1_cost_bit(ref_comp_p, 1);
+ ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p, 1);
- ref_costs_comp[LAST_FRAME] += av1_cost_bit(fwdref_comp_p1, 1);
- ref_costs_comp[LAST2_FRAME] += av1_cost_bit(fwdref_comp_p1, 0);
+ ref_costs_comp[LAST_FRAME] += av1_cost_bit(ref_comp_p1, 1);
+ ref_costs_comp[LAST2_FRAME] += av1_cost_bit(ref_comp_p1, 0);
- ref_costs_comp[LAST3_FRAME] += av1_cost_bit(fwdref_comp_p2, 0);
- ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(fwdref_comp_p2, 1);
+ ref_costs_comp[LAST3_FRAME] += av1_cost_bit(ref_comp_p2, 0);
+ ref_costs_comp[GOLDEN_FRAME] += av1_cost_bit(ref_comp_p2, 1);
- // NOTE: BWDREF and ALTREF each add an extra cost by coding 1 more bit.
+ // NOTE(zoeliu): BWDREF and ALTREF each add an extra cost by coding 1
+ // more bit.
ref_costs_comp[BWDREF_FRAME] += av1_cost_bit(bwdref_comp_p, 0);
ref_costs_comp[ALTREF_FRAME] += av1_cost_bit(bwdref_comp_p, 1);
#else
@@ -3376,11 +6059,11 @@
}
}
-static void store_coding_context(const MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int mode_index,
int64_t comp_pred_diff[REFERENCE_MODES],
int skippable) {
- const MACROBLOCKD *const xd = &x->e_mbd;
+ MACROBLOCKD *const xd = &x->e_mbd;
// Take a snapshot of the coding context so it can be
// restored if we decide to encode this way
@@ -3397,9 +6080,9 @@
static void setup_buffer_inter(const AV1_COMP *const cpi, MACROBLOCK *x,
MV_REFERENCE_FRAME ref_frame,
BLOCK_SIZE block_size, int mi_row, int mi_col,
- int_mv frame_nearest_mv[MAX_REF_FRAMES],
- int_mv frame_near_mv[MAX_REF_FRAMES],
- struct buf_2d yv12_mb[MAX_REF_FRAMES]
+ int_mv frame_nearest_mv[TOTAL_REFS_PER_FRAME],
+ int_mv frame_near_mv[TOTAL_REFS_PER_FRAME],
+ struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME]
[MAX_MB_PLANE]) {
const AV1_COMMON *cm = &cpi->common;
const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
@@ -3420,6 +6103,9 @@
cm, xd, mi, ref_frame,
#if CONFIG_REF_MV
&mbmi_ext->ref_mv_count[ref_frame], mbmi_ext->ref_mv_stack[ref_frame],
+#if CONFIG_EXT_INTER
+ mbmi_ext->compound_mode_context,
+#endif // CONFIG_EXT_INTER
#endif
candidates, mi_row, mi_col, NULL, NULL, mbmi_ext->mode_context);
@@ -3438,17 +6124,26 @@
static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, int mi_row, int mi_col,
- int_mv *tmp_mv, int *rate_mv) {
+#if CONFIG_EXT_INTER
+ int ref_idx, int mv_idx,
+#endif // CONFIG_EXT_INTER
+ int *rate_mv) {
MACROBLOCKD *xd = &x->e_mbd;
const AV1_COMMON *cm = &cpi->common;
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
- struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } };
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
int bestsme = INT_MAX;
int step_param;
int sadpb = x->sadperbit16;
MV mvp_full;
+#if CONFIG_EXT_INTER
+ int ref = mbmi->ref_frame[ref_idx];
+ MV ref_mv = x->mbmi_ext->ref_mvs[ref][mv_idx].as_mv;
+#else
int ref = mbmi->ref_frame[0];
MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+ int ref_idx = 0;
+#endif // CONFIG_EXT_INTER
int tmp_col_min = x->mv_col_min;
int tmp_col_max = x->mv_col_max;
@@ -3469,13 +6164,16 @@
// Swap out the reference frame for a version that's been scaled to
// match the resolution of the current frame, allowing the existing
// motion search code to be used without additional modifications.
- for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ backup_yv12[i] = xd->plane[i].pre[ref_idx];
- av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+ av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
}
+ av1_set_mv_search_range(x, &ref_mv);
+
#if CONFIG_REF_MV
- av1_set_mvcost(x, ref, 0, mbmi->ref_mv_idx);
+ av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
#endif
// Work out the size of the first step in the mv step search.
@@ -3491,9 +6189,9 @@
step_param = cpi->mv_step_param;
}
- if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) {
+ if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size) {
int boffset =
- 2 * (b_width_log2_lookup[BLOCK_64X64] -
+ 2 * (b_width_log2_lookup[cm->sb_size] -
AOMMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
step_param = AOMMAX(step_param, boffset);
}
@@ -3512,12 +6210,12 @@
if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
x->pred_mv[ref].row = 0;
x->pred_mv[ref].col = 0;
- tmp_mv->as_int = INVALID_MV;
+ x->best_mv.as_int = INVALID_MV;
if (scaled_ref_frame) {
int j;
for (j = 0; j < MAX_MB_PLANE; ++j)
- xd->plane[j].pre[0] = backup_yv12[j];
+ xd->plane[j].pre[ref_idx] = backup_yv12[j];
}
return;
}
@@ -3537,20 +6235,22 @@
mvp_full.col >>= 3;
mvp_full.row >>= 3;
+ x->best_mv.as_int = x->second_best_mv.as_int = INVALID_MV;
+
#if CONFIG_MOTION_VAR
switch (mbmi->motion_mode) {
case SIMPLE_TRANSLATION:
#endif // CONFIG_MOTION_VAR
bestsme = av1_full_pixel_search(cpi, x, bsize, &mvp_full, step_param,
sadpb, cond_cost_list(cpi, cost_list),
- &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
+ &ref_mv, INT_MAX, 1);
#if CONFIG_MOTION_VAR
break;
case OBMC_CAUSAL:
bestsme = av1_obmc_full_pixel_diamond(
cpi, x, &mvp_full, step_param, sadpb,
MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv,
- &tmp_mv->as_mv, 0);
+ &(x->best_mv.as_mv), 0);
break;
default: assert(0 && "Invalid motion mode!");
}
@@ -3568,32 +6268,60 @@
case SIMPLE_TRANSLATION:
#endif // CONFIG_MOTION_VAR
if (cpi->sf.use_upsampled_references) {
+ int best_mv_var;
+ const int try_second = x->second_best_mv.as_int != INVALID_MV &&
+ x->second_best_mv.as_int != x->best_mv.as_int;
const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
// Use up-sampled reference frames.
struct macroblockd_plane *const pd = &xd->plane[0];
- struct buf_2d backup_pred = pd->pre[0];
+ struct buf_2d backup_pred = pd->pre[ref_idx];
const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
// Set pred for Y plane
- setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
- upsampled_ref->y_stride, (mi_row << 3),
- (mi_col << 3), NULL, pd->subsampling_x,
- pd->subsampling_y);
+ setup_pred_plane(
+ &pd->pre[ref_idx], upsampled_ref->y_buffer,
+ upsampled_ref->y_crop_width, upsampled_ref->y_crop_height,
+ upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3), NULL,
+ pd->subsampling_x, pd->subsampling_y);
- bestsme = cpi->find_fractional_mv_step(
- x, &tmp_mv->as_mv, &ref_mv, cm->allow_high_precision_mv,
- x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+ best_mv_var = cpi->find_fractional_mv_step(
+ x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, pw, ph,
1);
+ if (try_second) {
+ const int minc = AOMMAX(x->mv_col_min * 8, ref_mv.col - MV_MAX);
+ const int maxc = AOMMIN(x->mv_col_max * 8, ref_mv.col + MV_MAX);
+ const int minr = AOMMAX(x->mv_row_min * 8, ref_mv.row - MV_MAX);
+ const int maxr = AOMMIN(x->mv_row_max * 8, ref_mv.row + MV_MAX);
+ int this_var;
+ MV best_mv = x->best_mv.as_mv;
+
+ x->best_mv = x->second_best_mv;
+ if (x->best_mv.as_mv.row * 8 <= maxr &&
+ x->best_mv.as_mv.row * 8 >= minr &&
+ x->best_mv.as_mv.col * 8 <= maxc &&
+ x->best_mv.as_mv.col * 8 >= minc) {
+ this_var = cpi->find_fractional_mv_step(
+ x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost,
+ &dis, &x->pred_sse[ref], NULL, pw, ph, 1);
+ if (this_var < best_mv_var) best_mv = x->best_mv.as_mv;
+ x->best_mv.as_mv = best_mv;
+ }
+ }
+
// Restore the reference frames.
- pd->pre[0] = backup_pred;
+ pd->pre[ref_idx] = backup_pred;
} else {
cpi->find_fractional_mv_step(
- x, &tmp_mv->as_mv, &ref_mv, cm->allow_high_precision_mv,
- x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+ x, &ref_mv, cm->allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0,
0);
@@ -3602,7 +6330,7 @@
break;
case OBMC_CAUSAL:
av1_find_best_obmc_sub_pixel_tree_up(
- cpi, x, mi_row, mi_col, &tmp_mv->as_mv, &ref_mv,
+ cpi, x, mi_row, mi_col, &x->best_mv.as_mv, &ref_mv,
cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize],
cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step,
x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], 0,
@@ -3612,7 +6340,7 @@
}
#endif // CONFIG_MOTION_VAR
}
- *rate_mv = av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost,
+ *rate_mv = av1_mv_bit_cost(&x->best_mv.as_mv, &ref_mv, x->nmvjointcost,
x->mvcost, MV_COST_WEIGHT);
#if CONFIG_MOTION_VAR
@@ -3620,11 +6348,12 @@
#else
if (cpi->sf.adaptive_motion_search)
#endif // CONFIG_MOTION_VAR
- x->pred_mv[ref] = tmp_mv->as_mv;
+ x->pred_mv[ref] = x->best_mv.as_mv;
if (scaled_ref_frame) {
int i;
- for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[ref_idx] = backup_yv12[i];
}
}
@@ -3638,6 +6367,163 @@
}
}
+#if CONFIG_EXT_INTER
+static void do_masked_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
+ const uint8_t *mask, int mask_stride,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int_mv *tmp_mv, int *rate_mv, int ref_idx,
+ int mv_idx) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const AV1_COMMON *cm = &cpi->common;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+ int bestsme = INT_MAX;
+ int step_param;
+ int sadpb = x->sadperbit16;
+ MV mvp_full;
+ int ref = mbmi->ref_frame[ref_idx];
+ MV ref_mv = x->mbmi_ext->ref_mvs[ref][mv_idx].as_mv;
+
+ int tmp_col_min = x->mv_col_min;
+ int tmp_col_max = x->mv_col_max;
+ int tmp_row_min = x->mv_row_min;
+ int tmp_row_max = x->mv_row_max;
+
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref);
+ int i;
+
+ MV pred_mv[3];
+ pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+ pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv;
+ pred_mv[2] = x->pred_mv[ref];
+
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, ref, ref_idx, mbmi->ref_mv_idx);
+#endif
+
+ if (scaled_ref_frame) {
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ backup_yv12[i] = xd->plane[i].pre[ref_idx];
+
+ av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
+ }
+
+ av1_set_mv_search_range(x, &ref_mv);
+
+ // Work out the size of the first step in the mv step search.
+ // 0 here is maximum length first step. 1 is MAX >> 1 etc.
+ if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
+ // Take wtd average of the step_params based on the last frame's
+ // max mv magnitude and that based on the best ref mvs of the current
+ // block for the given reference.
+ step_param =
+ (av1_init_search_range(x->max_mv_context[ref]) + cpi->mv_step_param) /
+ 2;
+ } else {
+ step_param = cpi->mv_step_param;
+ }
+
+ // TODO(debargha): is show_frame needed here?
+ if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size && cm->show_frame) {
+ int boffset =
+ 2 * (b_width_log2_lookup[cm->sb_size] -
+ AOMMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
+ step_param = AOMMAX(step_param, boffset);
+ }
+
+ if (cpi->sf.adaptive_motion_search) {
+ int bwl = b_width_log2_lookup[bsize];
+ int bhl = b_height_log2_lookup[bsize];
+ int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
+
+ if (tlevel < 5) step_param += 2;
+
+ // pred_mv_sad is not set up for dynamically scaled frames.
+ if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
+ for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
+ if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+ x->pred_mv[ref].row = 0;
+ x->pred_mv[ref].col = 0;
+ tmp_mv->as_int = INVALID_MV;
+
+ if (scaled_ref_frame) {
+ int j;
+ for (j = 0; j < MAX_MB_PLANE; ++j)
+ xd->plane[j].pre[ref_idx] = backup_yv12[j];
+ }
+ return;
+ }
+ }
+ }
+ }
+
+ mvp_full = pred_mv[x->mv_best_ref_index[ref]];
+
+ mvp_full.col >>= 3;
+ mvp_full.row >>= 3;
+
+ bestsme = av1_masked_full_pixel_diamond(
+ cpi, x, mask, mask_stride, &mvp_full, step_param, sadpb,
+ MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv,
+ &tmp_mv->as_mv, ref_idx);
+
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+
+ if (bestsme < INT_MAX) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ av1_find_best_masked_sub_pixel_tree_up(
+ cpi, x, mask, mask_stride, mi_row, mi_col, &tmp_mv->as_mv, &ref_mv,
+ cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step,
+ x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], ref_idx,
+ cpi->sf.use_upsampled_references);
+ }
+ *rate_mv = av1_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost,
+ x->mvcost, MV_COST_WEIGHT);
+
+ if (cpi->sf.adaptive_motion_search && cm->show_frame)
+ x->pred_mv[ref] = tmp_mv->as_mv;
+
+ if (scaled_ref_frame) {
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[ref_idx] = backup_yv12[i];
+ }
+}
+
+static void do_masked_motion_search_indexed(const AV1_COMP *const cpi,
+ MACROBLOCK *x, int wedge_index,
+ int wedge_sign, BLOCK_SIZE bsize,
+ int mi_row, int mi_col,
+ int_mv *tmp_mv, int *rate_mv,
+ int mv_idx[2], int which) {
+ // NOTE: 'which' selects the reference(s) to search:
+ //       0 - ref 0 only, 1 - ref 1 only, 2 - both
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ BLOCK_SIZE sb_type = mbmi->sb_type;
+ const uint8_t *mask;
+ const int mask_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
+ mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type);
+
+ if (which == 0 || which == 2)
+ do_masked_motion_search(cpi, x, mask, mask_stride, bsize, mi_row, mi_col,
+ &tmp_mv[0], &rate_mv[0], 0, mv_idx[0]);
+
+ if (which == 1 || which == 2) {
+ // get the negative mask
+ mask = av1_get_contiguous_soft_mask(wedge_index, !wedge_sign, sb_type);
+ do_masked_motion_search(cpi, x, mask, mask_stride, bsize, mi_row, mi_col,
+ &tmp_mv[1], &rate_mv[1], 1, mv_idx[1]);
+ }
+}
+#endif // CONFIG_EXT_INTER
+
// In some situations we want to discount the apparent cost of a new motion
// vector. Where there is a subtle motion field and especially where there is
// low spatial complexity then it can be hard to cover the cost of a new motion
@@ -3647,7 +6533,7 @@
// visual quality.
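// When this test passes, callers discount the signalled mv rate, e.g.
//   rate_mv = AOMMAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
// so a plausible NEWMV candidate is not priced out in flat, low-detail areas.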
static int discount_newmv_test(const AV1_COMP *const cpi, int this_mode,
int_mv this_mv,
- int_mv (*mode_mv)[MAX_REF_FRAMES],
+ int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME],
int ref_frame) {
return (!cpi->rc.is_src_frame_alt_ref && (this_mode == NEWMV) &&
(this_mv.as_int != 0) &&
@@ -3668,18 +6554,355 @@
xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
}
+#if CONFIG_EXT_INTER
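+// Cheap estimate of the wedge sign: measure how well each of the two
+// single-reference predictions matches the source over the four quadrants
+// (esq[0][*] for pred0, esq[1][*] for pred1) and pick the sign from which
+// prediction fits the top-left vs. the bottom-right part of the block better.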
+static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
+ const BLOCK_SIZE bsize, const uint8_t *pred0,
+ int stride0, const uint8_t *pred1, int stride1) {
+ const struct macroblock_plane *const p = &x->plane[0];
+ const uint8_t *src = p->src.buf;
+ int src_stride = p->src.stride;
+ const int f_index = bsize - BLOCK_8X8;
+ const int bw = 4 << (b_width_log2_lookup[bsize]);
+ const int bh = 4 << (b_height_log2_lookup[bsize]);
+ uint32_t esq[2][4], var;
+ int64_t tl, br;
+
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ pred0 = CONVERT_TO_BYTEPTR(pred0);
+ pred1 = CONVERT_TO_BYTEPTR(pred1);
+ }
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+ var = cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
+ var = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred0 + bw / 2,
+ stride0, &esq[0][1]);
+ var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride,
+ pred0 + bh / 2 * stride0, stride0, &esq[0][2]);
+ var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride,
+ pred0 + bh / 2 * stride0 + bw / 2, stride0,
+ &esq[0][3]);
+ var = cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
+ var = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred1 + bw / 2,
+ stride1, &esq[1][1]);
+ var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride,
+ pred1 + bh / 2 * stride1, stride1, &esq[1][2]);
+ var = cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride,
+ pred1 + bh / 2 * stride1 + bw / 2, stride1,
+ &esq[1][3]);
+ (void)var;
+
+ tl = (int64_t)(esq[0][0] + esq[0][1] + esq[0][2]) -
+ (int64_t)(esq[1][0] + esq[1][1] + esq[1][2]);
+ br = (int64_t)(esq[1][3] + esq[1][1] + esq[1][2]) -
+ (int64_t)(esq[0][3] + esq[0][1] + esq[0][2]);
+ return (tl + br > 0);
+}
+#endif // CONFIG_EXT_INTER
+
+#if !CONFIG_DUAL_FILTER
+static InterpFilter predict_interp_filter(
+ const AV1_COMP *cpi, const MACROBLOCK *x, const BLOCK_SIZE bsize,
+ const int mi_row, const int mi_col,
+ InterpFilter (*single_filter)[TOTAL_REFS_PER_FRAME]) {
+ InterpFilter best_filter = SWITCHABLE;
+ const AV1_COMMON *cm = &cpi->common;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ int bsl = mi_width_log2_lookup[bsize];
+ int pred_filter_search =
+ cpi->sf.cb_pred_filter_search
+ ? (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_video_frame)) &
+ 0x1
+ : 0;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const int is_comp_pred = has_second_ref(mbmi);
+ const int this_mode = mbmi->mode;
+ int refs[2] = { mbmi->ref_frame[0],
+ (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+ if (pred_filter_search) {
+ InterpFilter af = SWITCHABLE, lf = SWITCHABLE;
+ if (xd->up_available) af = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
+ if (xd->left_available) lf = xd->mi[-1]->mbmi.interp_filter;
+
+#if CONFIG_EXT_INTER
+ if ((this_mode != NEWMV && this_mode != NEWFROMNEARMV &&
+ this_mode != NEW_NEWMV) ||
+ (af == lf))
+#else
+ if ((this_mode != NEWMV) || (af == lf))
+#endif // CONFIG_EXT_INTER
+ best_filter = af;
+ }
+ if (is_comp_pred) {
+ if (cpi->sf.adaptive_mode_search) {
+#if CONFIG_EXT_INTER
+ switch (this_mode) {
+ case NEAREST_NEARESTMV:
+ if (single_filter[NEARESTMV][refs[0]] ==
+ single_filter[NEARESTMV][refs[1]])
+ best_filter = single_filter[NEARESTMV][refs[0]];
+ break;
+ case NEAREST_NEARMV:
+ if (single_filter[NEARESTMV][refs[0]] ==
+ single_filter[NEARMV][refs[1]])
+ best_filter = single_filter[NEARESTMV][refs[0]];
+ break;
+ case NEAR_NEARESTMV:
+ if (single_filter[NEARMV][refs[0]] ==
+ single_filter[NEARESTMV][refs[1]])
+ best_filter = single_filter[NEARMV][refs[0]];
+ break;
+ case NEAR_NEARMV:
+ if (single_filter[NEARMV][refs[0]] == single_filter[NEARMV][refs[1]])
+ best_filter = single_filter[NEARMV][refs[0]];
+ break;
+ case ZERO_ZEROMV:
+ if (single_filter[ZEROMV][refs[0]] == single_filter[ZEROMV][refs[1]])
+ best_filter = single_filter[ZEROMV][refs[0]];
+ break;
+ case NEW_NEWMV:
+ if (single_filter[NEWMV][refs[0]] == single_filter[NEWMV][refs[1]])
+ best_filter = single_filter[NEWMV][refs[0]];
+ break;
+ case NEAREST_NEWMV:
+ if (single_filter[NEARESTMV][refs[0]] ==
+ single_filter[NEWMV][refs[1]])
+ best_filter = single_filter[NEARESTMV][refs[0]];
+ break;
+ case NEAR_NEWMV:
+ if (single_filter[NEARMV][refs[0]] == single_filter[NEWMV][refs[1]])
+ best_filter = single_filter[NEARMV][refs[0]];
+ break;
+ case NEW_NEARESTMV:
+ if (single_filter[NEWMV][refs[0]] ==
+ single_filter[NEARESTMV][refs[1]])
+ best_filter = single_filter[NEWMV][refs[0]];
+ break;
+ case NEW_NEARMV:
+ if (single_filter[NEWMV][refs[0]] == single_filter[NEARMV][refs[1]])
+ best_filter = single_filter[NEWMV][refs[0]];
+ break;
+ default:
+ if (single_filter[this_mode][refs[0]] ==
+ single_filter[this_mode][refs[1]])
+ best_filter = single_filter[this_mode][refs[0]];
+ break;
+ }
+#else
+ if (single_filter[this_mode][refs[0]] ==
+ single_filter[this_mode][refs[1]])
+ best_filter = single_filter[this_mode][refs[0]];
+#endif // CONFIG_EXT_INTER
+ }
+ }
+ if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
+ best_filter = EIGHTTAP_REGULAR;
+ }
+ return best_filter;
+}
+#endif
+
+#if CONFIG_EXT_INTER
+// Choose the best wedge index and sign
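+// Candidates are scored in the residual domain: the residuals of the two
+// predictions (r0, r1) and their difference (d10) are computed once, and each
+// wedge mask then only needs a masked SSE over those buffers, so no per-wedge
+// re-prediction is required.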
+static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const uint8_t *const p0,
+ const uint8_t *const p1, int *const best_wedge_sign,
+ int *const best_wedge_index) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const src = &x->plane[0].src;
+ const int bw = 4 * num_4x4_blocks_wide_lookup[bsize];
+ const int bh = 4 * num_4x4_blocks_high_lookup[bsize];
+ const int N = bw * bh;
+ int rate;
+ int64_t dist;
+ int64_t rd, best_rd = INT64_MAX;
+ int wedge_index;
+ int wedge_sign;
+ int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+ const uint8_t *mask;
+ uint64_t sse;
+#if CONFIG_AOM_HIGHBITDEPTH
+ const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+#else
+ const int bd_round = 0;
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+ DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]);
+
+ int64_t sign_limit;
+
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (hbd) {
+ aom_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
+ CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+ } else // NOLINT
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ {
+ aom_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw);
+ aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
+ aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
+ }
+
+ sign_limit = ((int64_t)aom_sum_squares_i16(r0, N) -
+ (int64_t)aom_sum_squares_i16(r1, N)) *
+ (1 << WEDGE_WEIGHT_BITS) / 2;
+
+ av1_wedge_compute_delta_squares(ds, r0, r1, N);
+
+ for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
+ wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
+
+ mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ sse = av1_wedge_sse_from_residuals(r1, d10, mask, N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+ rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+
+ if (rd < best_rd) {
+ *best_wedge_index = wedge_index;
+ *best_wedge_sign = wedge_sign;
+ best_rd = rd;
+ }
+ }
+
+ return best_rd;
+}
+
+// Choose the best wedge index for the specified sign
+static int64_t pick_wedge_fixed_sign(
+ const AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const uint8_t *const p0, const uint8_t *const p1,
+ const int wedge_sign, int *const best_wedge_index) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const src = &x->plane[0].src;
+ const int bw = 4 * num_4x4_blocks_wide_lookup[bsize];
+ const int bh = 4 * num_4x4_blocks_high_lookup[bsize];
+ const int N = bw * bh;
+ int rate;
+ int64_t dist;
+ int64_t rd, best_rd = INT64_MAX;
+ int wedge_index;
+ int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+ const uint8_t *mask;
+ uint64_t sse;
+#if CONFIG_AOM_HIGHBITDEPTH
+ const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+#else
+ const int bd_round = 0;
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
+
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (hbd) {
+ aom_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
+ aom_highbd_subtract_block(bh, bw, d10, bw, CONVERT_TO_BYTEPTR(p1), bw,
+ CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+ } else // NOLINT
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ {
+ aom_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
+ aom_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
+ }
+
+ for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ sse = av1_wedge_sse_from_residuals(r1, d10, mask, N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+ rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+
+ if (rd < best_rd) {
+ *best_wedge_index = wedge_index;
+ best_rd = rd;
+ }
+ }
+
+ return best_rd;
+}
+
+static int64_t pick_interinter_wedge(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize,
+ const uint8_t *const p0,
+ const uint8_t *const p1) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const int bw = 4 * num_4x4_blocks_wide_lookup[bsize];
+
+ int64_t rd;
+ int wedge_index = -1;
+ int wedge_sign = 0;
+
+ assert(is_interinter_wedge_used(bsize));
+
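+ // With the fast_wedge_sign_estimate speed feature the sign is estimated up
+ // front and only the wedge index is searched; otherwise sign and index are
+ // searched jointly.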
+ if (cpi->sf.fast_wedge_sign_estimate) {
+ wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw);
+ rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, wedge_sign, &wedge_index);
+ } else {
+ rd = pick_wedge(cpi, x, bsize, p0, p1, &wedge_sign, &wedge_index);
+ }
+
+ mbmi->interinter_wedge_sign = wedge_sign;
+ mbmi->interinter_wedge_index = wedge_index;
+ return rd;
+}
+
+static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize,
+ const uint8_t *const p0,
+ const uint8_t *const p1) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+
+ int64_t rd;
+ int wedge_index = -1;
+
+ assert(is_interintra_wedge_used(bsize));
+
+ rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, 0, &wedge_index);
+
+ mbmi->interintra_wedge_sign = 0;
+ mbmi->interintra_wedge_index = wedge_index;
+ return rd;
+}
+#endif // CONFIG_EXT_INTER
+
static int64_t handle_inter_mode(
const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int *rate2,
int64_t *distortion, int *skippable, int *rate_y, int *rate_uv,
- int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row,
+ int *disable_skip, int_mv (*mode_mv)[TOTAL_REFS_PER_FRAME], int mi_row,
int mi_col,
#if CONFIG_MOTION_VAR
uint8_t *above_pred_buf[3], int above_pred_stride[3],
uint8_t *left_pred_buf[3], int left_pred_stride[3],
#endif // CONFIG_MOTION_VAR
- int_mv single_newmv[MAX_REF_FRAMES],
- InterpFilter (*single_filter)[MAX_REF_FRAMES],
- int (*single_skippable)[MAX_REF_FRAMES], int64_t *psse,
+#if CONFIG_EXT_INTER
+ int_mv single_newmvs[2][TOTAL_REFS_PER_FRAME],
+ int single_newmvs_rate[2][TOTAL_REFS_PER_FRAME],
+ int *compmode_interintra_cost, int *compmode_wedge_cost,
+ int64_t (*const modelled_rd)[TOTAL_REFS_PER_FRAME],
+#else
+ int_mv single_newmv[TOTAL_REFS_PER_FRAME],
+#endif // CONFIG_EXT_INTER
+ InterpFilter (*single_filter)[TOTAL_REFS_PER_FRAME],
+ int (*single_skippable)[TOTAL_REFS_PER_FRAME], int64_t *psse,
const int64_t ref_best_rd) {
const AV1_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
@@ -3692,50 +6915,136 @@
int refs[2] = { mbmi->ref_frame[0],
(mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
int_mv cur_mv[2];
+ int rate_mv = 0;
+#if CONFIG_EXT_INTER
+ int pred_exists = 1;
+ const int bw = 4 * num_4x4_blocks_wide_lookup[bsize];
+ int mv_idx = (this_mode == NEWFROMNEARMV) ? 1 : 0;
+ int_mv single_newmv[TOTAL_REFS_PER_FRAME];
+ const unsigned int *const interintra_mode_cost =
+ cpi->interintra_mode_cost[size_group_lookup[bsize]];
+ const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME);
+#if CONFIG_REF_MV
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+#endif
+#endif // CONFIG_EXT_INTER
#if CONFIG_AOM_HIGHBITDEPTH
- DECLARE_ALIGNED(16, uint16_t, tmp_buf16[MAX_MB_PLANE * 64 * 64]);
- uint8_t *tmp_buf;
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
#else
- DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf_[MAX_MB_PLANE * MAX_SB_SQUARE]);
#endif // CONFIG_AOM_HIGHBITDEPTH
+ uint8_t *tmp_buf;
+
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ int allow_motvar =
+#if CONFIG_EXT_INTER
+ !is_comp_interintra_pred &&
+#endif // CONFIG_EXT_INTER
+ is_motion_variation_allowed(mbmi);
+ int rate2_nocoeff = 0, best_rate2 = INT_MAX, best_skippable, best_xskip,
+ best_disable_skip = 0;
+ int best_rate_y, best_rate_uv;
+#if CONFIG_VAR_TX
+ uint8_t best_blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
+#endif // CONFIG_VAR_TX
+ int64_t best_distortion = INT64_MAX;
+ int64_t best_rd = INT64_MAX;
+ MB_MODE_INFO best_mbmi;
+#if CONFIG_EXT_INTER
+ int rate2_bmc_nocoeff;
+ int rate_mv_bmc;
+ MB_MODE_INFO best_bmc_mbmi;
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
int64_t rd = INT64_MAX;
uint8_t *orig_dst[MAX_MB_PLANE];
int orig_dst_stride[MAX_MB_PLANE];
uint8_t *tmp_dst[MAX_MB_PLANE];
int tmp_dst_stride[MAX_MB_PLANE];
+ int rs = 0;
InterpFilter assign_filter = SWITCHABLE;
- int bsl = mi_width_log2_lookup[bsize];
- int pred_filter_search =
- cpi->sf.cb_pred_filter_search
- ? (((mi_row + mi_col) >> bsl) +
- get_chessboard_index(cm->current_video_frame)) &
- 0x1
- : 0;
-
int skip_txfm_sb = 0;
int64_t skip_sse_sb = INT64_MAX;
int64_t distortion_y = 0, distortion_uv = 0;
int16_t mode_ctx = mbmi_ext->mode_context[refs[0]];
-#if CONFIG_MOTION_VAR
- int allow_motion_variation = is_motion_variation_allowed(mbmi);
- int rate2_nocoeff, best_rate2 = INT_MAX, best_rate_y, best_rate_uv,
- best_skippable, best_xskip, best_disable_skip = 0;
- int64_t best_distortion = INT64_MAX;
- MB_MODE_INFO best_mbmi;
-#endif // CONFIG_MOTION_VAR
- int tmp_rate;
- int64_t tmp_dist;
- int rate_mv = 0;
- int rs;
+
+#if CONFIG_EXT_INTER
+ *compmode_interintra_cost = 0;
+ mbmi->use_wedge_interintra = 0;
+ *compmode_wedge_cost = 0;
+ mbmi->use_wedge_interinter = 0;
+
+ // is_comp_interintra_pred implies !is_comp_pred
+ assert(!is_comp_interintra_pred || (!is_comp_pred));
+ // is_comp_interintra_pred implies is_interintra_allowed(mbmi->sb_type)
+ assert(!is_comp_interintra_pred || is_interintra_allowed(mbmi));
+#endif // CONFIG_EXT_INTER
#if CONFIG_REF_MV
- mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame,
- bsize, -1);
+#if CONFIG_EXT_INTER
+ if (is_comp_pred)
+ mode_ctx = mbmi_ext->compound_mode_context[refs[0]];
+ else
+#endif // CONFIG_EXT_INTER
+ mode_ctx = av1_mode_context_analyzer(mbmi_ext->mode_context,
+ mbmi->ref_frame, bsize, -1);
#endif
- if (this_mode == NEWMV) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_);
+ else
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ tmp_buf = tmp_buf_;
+
+ if (is_comp_pred) {
+ if (frame_mv[refs[0]].as_int == INVALID_MV ||
+ frame_mv[refs[1]].as_int == INVALID_MV)
+ return INT64_MAX;
+ }
+
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ if (have_newmv_in_inter_mode(this_mode)) {
if (is_comp_pred) {
+#if CONFIG_EXT_INTER
+ for (i = 0; i < 2; ++i) {
+ single_newmv[refs[i]].as_int = single_newmvs[mv_idx][refs[i]].as_int;
+ }
+
+ if (this_mode == NEW_NEWMV) {
+ frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+ frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+ if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+ joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, NULL,
+ single_newmv, &rate_mv, 0);
+ } else {
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+ rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
+ &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, mbmi->ref_frame[1], 1, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
+ rate_mv += av1_mv_bit_cost(
+ &frame_mv[refs[1]].as_mv, &mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ }
+ } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+ frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+ rate_mv = av1_mv_bit_cost(&frame_mv[refs[1]].as_mv,
+ &mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ } else {
+ frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+ rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
+ &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ }
+#else
// Initialize mv using single prediction mode result.
frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
@@ -3744,27 +7053,46 @@
joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col,
single_newmv, &rate_mv, 0);
} else {
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, mbmi->ref_frame[0], 0, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
rate_mv = av1_mv_bit_cost(&frame_mv[refs[0]].as_mv,
- &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+ &mbmi_ext->ref_mvs[refs[0]][0].as_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+#if CONFIG_REF_MV
+ av1_set_mvcost(x, mbmi->ref_frame[1], 1, mbmi->ref_mv_idx);
+#endif // CONFIG_REF_MV
rate_mv += av1_mv_bit_cost(&frame_mv[refs[1]].as_mv,
- &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+ &mbmi_ext->ref_mvs[refs[1]][0].as_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
+#endif // CONFIG_EXT_INTER
} else {
- int_mv tmp_mv;
- single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv);
- if (tmp_mv.as_int == INVALID_MV) return INT64_MAX;
+#if CONFIG_EXT_INTER
+ if (is_comp_interintra_pred) {
+ x->best_mv = single_newmvs[mv_idx][refs[0]];
+ rate_mv = single_newmvs_rate[mv_idx][refs[0]];
+ } else {
+ single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, mv_idx,
+ &rate_mv);
+ single_newmvs[mv_idx][refs[0]] = x->best_mv;
+ single_newmvs_rate[mv_idx][refs[0]] = rate_mv;
+ }
+#else
+ single_motion_search(cpi, x, bsize, mi_row, mi_col, &rate_mv);
+ single_newmv[refs[0]] = x->best_mv;
+#endif // CONFIG_EXT_INTER
- frame_mv[refs[0]].as_int = xd->mi[0]->bmi[0].as_mv[0].as_int =
- tmp_mv.as_int;
- single_newmv[refs[0]].as_int = tmp_mv.as_int;
+ if (x->best_mv.as_int == INVALID_MV) return INT64_MAX;
+
+ frame_mv[refs[0]] = x->best_mv;
+ xd->mi[0]->bmi[0].as_mv[0] = x->best_mv;
// Estimate the rate implications of a new mv but discount this
// under certain circumstances where we want to help initiate a weak
// motion field, where the distortion gain for a single block may not
// be enough to overcome the cost of a new mv.
- if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
+ if (discount_newmv_test(cpi, this_mode, x->best_mv, mode_mv, refs[0])) {
rate_mv = AOMMAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
}
}
@@ -3773,22 +7101,33 @@
for (i = 0; i < is_comp_pred + 1; ++i) {
cur_mv[i] = frame_mv[refs[i]];
- // Clip "next_nearest" so that it does not extend to far out of image
- if (this_mode != NEWMV) clamp_mv2(&cur_mv[i].as_mv, xd);
+// Clip "next_nearest" so that it does not extend too far out of the image
+#if CONFIG_EXT_INTER
+ if (this_mode != NEWMV && this_mode != NEWFROMNEARMV)
+#else
+ if (this_mode != NEWMV)
+#endif // CONFIG_EXT_INTER
+ clamp_mv2(&cur_mv[i].as_mv, xd);
if (mv_check_bounds(x, &cur_mv[i].as_mv)) return INT64_MAX;
mbmi->mv[i].as_int = cur_mv[i].as_int;
}
#if CONFIG_REF_MV
- if (this_mode == NEARESTMV && is_comp_pred) {
+#if CONFIG_EXT_INTER
+ if (this_mode == NEAREST_NEARESTMV)
+#else
+ if (this_mode == NEARESTMV && is_comp_pred)
+#endif // CONFIG_EXT_INTER
+ {
+#if !CONFIG_EXT_INTER
uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+#endif
if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) {
cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
for (i = 0; i < 2; ++i) {
- lower_mv_precision(&cur_mv[i].as_mv, cm->allow_high_precision_mv);
clamp_mv2(&cur_mv[i].as_mv, xd);
if (mv_check_bounds(x, &cur_mv[i].as_mv)) return INT64_MAX;
mbmi->mv[i].as_int = cur_mv[i].as_int;
@@ -3796,38 +7135,74 @@
}
}
+#if CONFIG_EXT_INTER
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) {
+ if (this_mode == NEAREST_NEWMV || this_mode == NEAREST_NEARMV) {
+ cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
+
+ lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv);
+ clamp_mv2(&cur_mv[0].as_mv, xd);
+ if (mv_check_bounds(x, &cur_mv[0].as_mv)) return INT64_MAX;
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ }
+
+ if (this_mode == NEW_NEARESTMV || this_mode == NEAR_NEARESTMV) {
+ cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
+
+ lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv);
+ clamp_mv2(&cur_mv[1].as_mv, xd);
+ if (mv_check_bounds(x, &cur_mv[1].as_mv)) return INT64_MAX;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ }
+ }
+
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
+ if (this_mode == NEAR_NEWMV || this_mode == NEAR_NEARESTMV ||
+ this_mode == NEAR_NEARMV) {
+ cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][1].this_mv;
+
+ lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv);
+ clamp_mv2(&cur_mv[0].as_mv, xd);
+ if (mv_check_bounds(x, &cur_mv[0].as_mv)) return INT64_MAX;
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ }
+
+ if (this_mode == NEW_NEARMV || this_mode == NEAREST_NEARMV ||
+ this_mode == NEAR_NEARMV) {
+ cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][1].comp_mv;
+
+ lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv);
+ clamp_mv2(&cur_mv[1].as_mv, xd);
+ if (mv_check_bounds(x, &cur_mv[1].as_mv)) return INT64_MAX;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ }
+ }
+#else
if (this_mode == NEARMV && is_comp_pred) {
uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
- const int ref_mv_idx = mbmi->ref_mv_idx + 1;
if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
+ int ref_mv_idx = mbmi->ref_mv_idx + 1;
cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
for (i = 0; i < 2; ++i) {
- lower_mv_precision(&cur_mv[i].as_mv, cm->allow_high_precision_mv);
clamp_mv2(&cur_mv[i].as_mv, xd);
if (mv_check_bounds(x, &cur_mv[i].as_mv)) return INT64_MAX;
mbmi->mv[i].as_int = cur_mv[i].as_int;
}
}
}
-#endif
+#endif // CONFIG_EXT_INTER
+#endif // CONFIG_REF_MV
-// do first prediction into the destination buffer. Do the next
-// prediction into a temporary buffer. Then keep track of which one
-// of these currently holds the best predictor, and use the other
-// one for future predictions. In the end, copy from tmp_buf to
-// dst if necessary.
-#if CONFIG_AOM_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
- } else {
- tmp_buf = (uint8_t *)tmp_buf16;
- }
-#endif // CONFIG_AOM_HIGHBITDEPTH
+  // Do the first prediction into the destination buffer and the next into a
+  // temporary buffer. Keep track of which one currently holds the best
+  // predictor and use the other for future predictions; at the end, copy
+  // from tmp_buf to dst if necessary.
for (i = 0; i < MAX_MB_PLANE; i++) {
- tmp_dst[i] = tmp_buf + i * 64 * 64;
- tmp_dst_stride[i] = 64;
+ tmp_dst[i] = tmp_buf + i * MAX_SB_SQUARE;
+ tmp_dst_stride[i] = MAX_SB_SIZE;
}
for (i = 0; i < MAX_MB_PLANE; i++) {
orig_dst[i] = xd->plane[i].dst.buf;
@@ -3841,115 +7216,482 @@
//
// Under some circumstances we discount the cost of new mv mode to encourage
// initiation of a motion field.
- if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv, refs[0]))
+ if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv,
+ refs[0])) {
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ *rate2 += AOMMIN(cost_mv_ref(cpi, this_mode, is_comp_pred, mode_ctx),
+ cost_mv_ref(cpi, NEARESTMV, is_comp_pred, mode_ctx));
+#else
*rate2 += AOMMIN(cost_mv_ref(cpi, this_mode, mode_ctx),
cost_mv_ref(cpi, NEARESTMV, mode_ctx));
- else
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ } else {
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ *rate2 += cost_mv_ref(cpi, this_mode, is_comp_pred, mode_ctx);
+#else
*rate2 += cost_mv_ref(cpi, this_mode, mode_ctx);
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ }
if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
- mbmi->mode != NEARESTMV)
+#if CONFIG_EXT_INTER
+ mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV
+#else
+ mbmi->mode != NEARESTMV
+#endif // CONFIG_EXT_INTER
+ )
return INT64_MAX;
if (cm->interp_filter == SWITCHABLE) {
- if (pred_filter_search) {
- InterpFilter af = SWITCHABLE, lf = SWITCHABLE;
- if (xd->up_available) af = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
- if (xd->left_available) lf = xd->mi[-1]->mbmi.interp_filter;
-
- if (this_mode != NEWMV || af == lf) assign_filter = af;
- }
-
- if (is_comp_pred) {
- if (frame_mv[refs[0]].as_int == INVALID_MV ||
- frame_mv[refs[1]].as_int == INVALID_MV) {
- return INT64_MAX;
- }
- if (cpi->sf.adaptive_mode_search) {
- if (single_filter[this_mode][refs[0]] ==
- single_filter[this_mode][refs[1]]) {
- assign_filter = single_filter[this_mode][refs[0]];
- }
- }
- }
- if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
- assign_filter = EIGHTTAP;
- }
-#if CONFIG_EXT_INTERP
- if (!is_interp_needed(xd)) assign_filter = EIGHTTAP;
+#if !CONFIG_DUAL_FILTER
+ assign_filter =
+ predict_interp_filter(cpi, x, bsize, mi_row, mi_col, single_filter);
+#endif
+#if CONFIG_EXT_INTERP || CONFIG_DUAL_FILTER
+ if (!av1_is_interp_needed(xd)) assign_filter = EIGHTTAP_REGULAR;
#endif
} else {
assign_filter = cm->interp_filter;
}
- mbmi->interp_filter = assign_filter == SWITCHABLE ? EIGHTTAP : assign_filter;
- rs = av1_get_switchable_rate(cpi, xd);
- av1_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
- model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb,
- &skip_sse_sb);
- rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
+  { // Do interpolation filter search within this block
+ int tmp_rate;
+ int64_t tmp_dist;
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] =
+ assign_filter == SWITCHABLE ? EIGHTTAP_REGULAR : assign_filter;
+ mbmi->interp_filter[1] =
+ assign_filter == SWITCHABLE ? EIGHTTAP_REGULAR : assign_filter;
+ mbmi->interp_filter[2] =
+ assign_filter == SWITCHABLE ? EIGHTTAP_REGULAR : assign_filter;
+ mbmi->interp_filter[3] =
+ assign_filter == SWITCHABLE ? EIGHTTAP_REGULAR : assign_filter;
+#else
+ mbmi->interp_filter =
+ assign_filter == SWITCHABLE ? EIGHTTAP_REGULAR : assign_filter;
+#endif
+ rs = av1_get_switchable_rate(cpi, xd);
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+ &tmp_dist, &skip_txfm_sb, &skip_sse_sb);
+ rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
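+    // The RD computed above is the baseline with the default filter; a
+    // candidate in the switchable-filter search below is kept only if it
+    // beats this cost.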
- if (assign_filter == SWITCHABLE) {
- // do interp_filter search
- if (is_interp_needed(xd)) {
- InterpFilter best_filter = mbmi->interp_filter;
- int best_in_temp = 0;
- restore_dst_buf(xd, tmp_dst, tmp_dst_stride);
- for (i = EIGHTTAP + 1; i < SWITCHABLE_FILTERS; ++i) {
- int tmp_skip_sb = 0;
- int64_t tmp_skip_sse = INT64_MAX;
- int64_t tmp_rd;
- int tmp_rs;
- mbmi->interp_filter = i;
- tmp_rs = av1_get_switchable_rate(cpi, xd);
- av1_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
- model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist, &tmp_skip_sb,
- &tmp_skip_sse);
- tmp_rd = RDCOST(x->rdmult, x->rddiv, tmp_rs + tmp_rate, tmp_dist);
+ if (assign_filter == SWITCHABLE) {
+ // do interp_filter search
+ if (av1_is_interp_needed(xd)) {
+ int best_in_temp = 0;
+#if CONFIG_DUAL_FILTER
+ InterpFilter best_filter[4];
+ av1_copy(best_filter, mbmi->interp_filter);
+#else
+ InterpFilter best_filter = mbmi->interp_filter;
+#endif
+ restore_dst_buf(xd, tmp_dst, tmp_dst_stride);
+#if CONFIG_DUAL_FILTER
+      // The EIGHTTAP_REGULAR case was already evaluated above, so start at 1.
+ for (i = 1; i < SWITCHABLE_FILTERS * SWITCHABLE_FILTERS; ++i)
+#else
+      // The EIGHTTAP_REGULAR case was already evaluated above, so start at 1.
+ for (i = 1; i < SWITCHABLE_FILTERS; ++i)
+#endif
+ {
+ int tmp_skip_sb = 0;
+ int64_t tmp_skip_sse = INT64_MAX;
+ int tmp_rs;
+ int64_t tmp_rd;
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] = filter_sets[i][0];
+ mbmi->interp_filter[1] = filter_sets[i][1];
+ mbmi->interp_filter[2] = filter_sets[i][0];
+ mbmi->interp_filter[3] = filter_sets[i][1];
+#else
+ mbmi->interp_filter = i;
+#endif
+ tmp_rs = av1_get_switchable_rate(cpi, xd);
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+ &tmp_dist, &tmp_skip_sb, &tmp_skip_sse);
+ tmp_rd = RDCOST(x->rdmult, x->rddiv, tmp_rs + tmp_rate, tmp_dist);
- if (tmp_rd < rd) {
- rd = tmp_rd;
- best_filter = mbmi->interp_filter;
- skip_txfm_sb = tmp_skip_sb;
- skip_sse_sb = tmp_skip_sse;
- best_in_temp = !best_in_temp;
- if (best_in_temp) {
- restore_dst_buf(xd, orig_dst, orig_dst_stride);
- } else {
- restore_dst_buf(xd, tmp_dst, tmp_dst_stride);
+ if (tmp_rd < rd) {
+ rd = tmp_rd;
+ rs = av1_get_switchable_rate(cpi, xd);
+#if CONFIG_DUAL_FILTER
+ av1_copy(best_filter, mbmi->interp_filter);
+#else
+ best_filter = mbmi->interp_filter;
+#endif
+ skip_txfm_sb = tmp_skip_sb;
+ skip_sse_sb = tmp_skip_sse;
+ best_in_temp = !best_in_temp;
+ if (best_in_temp) {
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
+ } else {
+ restore_dst_buf(xd, tmp_dst, tmp_dst_stride);
+ }
}
}
- }
- if (best_in_temp) {
- restore_dst_buf(xd, tmp_dst, tmp_dst_stride);
- } else {
- restore_dst_buf(xd, orig_dst, orig_dst_stride);
- }
- mbmi->interp_filter = best_filter;
- } else {
-#if !CONFIG_EXT_INTERP
- int best_rs = av1_get_switchable_rate(cpi, xd);
- int tmp_rs;
- InterpFilter best_filter = mbmi->interp_filter;
- for (i = 1; i < SWITCHABLE_FILTERS; ++i) {
- mbmi->interp_filter = i;
- tmp_rs = av1_get_switchable_rate(cpi, xd);
- if (tmp_rs < best_rs) {
- best_rs = tmp_rs;
- best_filter = i;
+ if (best_in_temp) {
+ restore_dst_buf(xd, tmp_dst, tmp_dst_stride);
+ } else {
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
}
- }
- mbmi->interp_filter = best_filter;
+#if CONFIG_DUAL_FILTER
+ av1_copy(mbmi->interp_filter, best_filter);
#else
- assert(0);
+ mbmi->interp_filter = best_filter;
#endif
+ } else {
+#if !CONFIG_EXT_INTERP && !CONFIG_DUAL_FILTER
+ int tmp_rs;
+ InterpFilter best_filter = mbmi->interp_filter;
+ rs = av1_get_switchable_rate(cpi, xd);
+ for (i = 1; i < SWITCHABLE_FILTERS; ++i) {
+ mbmi->interp_filter = i;
+ tmp_rs = av1_get_switchable_rate(cpi, xd);
+ if (tmp_rs < rs) {
+ rs = tmp_rs;
+ best_filter = i;
+ }
+ }
+ mbmi->interp_filter = best_filter;
+#else
+ assert(0);
+#endif
+ }
}
}
- if (cm->interp_filter != SWITCHABLE)
- assert(cm->interp_filter == mbmi->interp_filter);
+#if CONFIG_EXT_INTER
+#if CONFIG_MOTION_VAR
+ best_bmc_mbmi = *mbmi;
+ rate_mv_bmc = rate_mv;
+ rate2_bmc_nocoeff = *rate2;
+ if (cm->interp_filter == SWITCHABLE) rate2_bmc_nocoeff += rs;
+#endif // CONFIG_MOTION_VAR
+ if (is_comp_pred && is_interinter_wedge_used(bsize)) {
+ int rate_sum, rs2;
+ int64_t dist_sum;
+ int64_t best_rd_nowedge = INT64_MAX;
+ int64_t best_rd_wedge = INT64_MAX;
+ int tmp_skip_txfm_sb;
+ int64_t tmp_skip_sse_sb;
+
+ rs2 = av1_cost_bit(cm->fc->wedge_interinter_prob[bsize], 0);
+ mbmi->use_wedge_interinter = 0;
+ av1_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+ av1_subtract_plane(x, bsize, 0);
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv + rate_sum, dist_sum);
+ best_rd_nowedge = rd;
+
+    // Disable the wedge search if the source variance is small.
+ if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
+ best_rd_nowedge / 3 < ref_best_rd) {
+ uint8_t pred0[2 * MAX_SB_SQUARE];
+ uint8_t pred1[2 * MAX_SB_SQUARE];
+ uint8_t *preds0[1] = { pred0 };
+ uint8_t *preds1[1] = { pred1 };
+ int strides[1] = { bw };
+
+ mbmi->use_wedge_interinter = 1;
+ rs2 = av1_cost_literal(get_interinter_wedge_bits(bsize)) +
+ av1_cost_bit(cm->fc->wedge_interinter_prob[bsize], 1);
+
+ av1_build_inter_predictors_for_planes_single_buf(
+ xd, bsize, 0, 0, mi_row, mi_col, 0, preds0, strides);
+ av1_build_inter_predictors_for_planes_single_buf(
+ xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides);
+
+ // Choose the best wedge
+ best_rd_wedge = pick_interinter_wedge(cpi, x, bsize, pred0, pred1);
+ best_rd_wedge += RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv, 0);
+
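+      // For modes carrying a new motion vector, redo the motion search with
+      // the selected wedge mask so the vectors are tuned to the masked
+      // compound predictor.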
+ if (have_newmv_in_inter_mode(this_mode)) {
+ int_mv tmp_mv[2];
+ int rate_mvs[2], tmp_rate_mv = 0;
+ if (this_mode == NEW_NEWMV) {
+ int mv_idxs[2] = { 0, 0 };
+ do_masked_motion_search_indexed(
+ cpi, x, mbmi->interinter_wedge_index, mbmi->interinter_wedge_sign,
+ bsize, mi_row, mi_col, tmp_mv, rate_mvs, mv_idxs, 2);
+ tmp_rate_mv = rate_mvs[0] + rate_mvs[1];
+ mbmi->mv[0].as_int = tmp_mv[0].as_int;
+ mbmi->mv[1].as_int = tmp_mv[1].as_int;
+ } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) {
+ int mv_idxs[2] = { 0, 0 };
+ do_masked_motion_search_indexed(
+ cpi, x, mbmi->interinter_wedge_index, mbmi->interinter_wedge_sign,
+ bsize, mi_row, mi_col, tmp_mv, rate_mvs, mv_idxs, 0);
+ tmp_rate_mv = rate_mvs[0];
+ mbmi->mv[0].as_int = tmp_mv[0].as_int;
+ } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+ int mv_idxs[2] = { 0, 0 };
+ do_masked_motion_search_indexed(
+ cpi, x, mbmi->interinter_wedge_index, mbmi->interinter_wedge_sign,
+ bsize, mi_row, mi_col, tmp_mv, rate_mvs, mv_idxs, 1);
+ tmp_rate_mv = rate_mvs[1];
+ mbmi->mv[1].as_int = tmp_mv[1].as_int;
+ }
+ av1_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
+ rd =
+ RDCOST(x->rdmult, x->rddiv, rs2 + tmp_rate_mv + rate_sum, dist_sum);
+ if (rd < best_rd_wedge) {
+ best_rd_wedge = rd;
+ } else {
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ tmp_rate_mv = rate_mv;
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+ strides, preds1, strides);
+ }
+ av1_subtract_plane(x, bsize, 0);
+ rd =
+ estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, x->rddiv, rs2 + tmp_rate_mv + rate_sum,
+ dist_sum);
+ best_rd_wedge = rd;
+
+ if (best_rd_wedge < best_rd_nowedge) {
+ mbmi->use_wedge_interinter = 1;
+ xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+ xd->mi[0]->bmi[0].as_mv[1].as_int = mbmi->mv[1].as_int;
+ *rate2 += tmp_rate_mv - rate_mv;
+ rate_mv = tmp_rate_mv;
+ } else {
+ mbmi->use_wedge_interinter = 0;
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+ xd->mi[0]->bmi[0].as_mv[1].as_int = mbmi->mv[1].as_int;
+ }
+ } else {
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+ strides, preds1, strides);
+ av1_subtract_plane(x, bsize, 0);
+ rd =
+ estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, x->rddiv, rs2 + rate_mv + rate_sum, dist_sum);
+ best_rd_wedge = rd;
+ if (best_rd_wedge < best_rd_nowedge) {
+ mbmi->use_wedge_interinter = 1;
+ } else {
+ mbmi->use_wedge_interinter = 0;
+ }
+ }
+ }
+ if (ref_best_rd < INT64_MAX &&
+ AOMMIN(best_rd_wedge, best_rd_nowedge) / 3 > ref_best_rd)
+ return INT64_MAX;
+
+ pred_exists = 0;
+
+ if (mbmi->use_wedge_interinter)
+ *compmode_wedge_cost =
+ av1_cost_literal(get_interinter_wedge_bits(bsize)) +
+ av1_cost_bit(cm->fc->wedge_interinter_prob[bsize], 1);
+ else
+ *compmode_wedge_cost =
+ av1_cost_bit(cm->fc->wedge_interinter_prob[bsize], 0);
+ }
+
+ if (is_comp_interintra_pred) {
+ INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
+ int64_t best_interintra_rd = INT64_MAX;
+ int rmode, rate_sum;
+ int64_t dist_sum;
+ int j;
+ int64_t best_interintra_rd_nowedge = INT64_MAX;
+ int64_t best_interintra_rd_wedge = INT64_MAX;
+ int rwedge;
+ int_mv tmp_mv;
+ int tmp_rate_mv = 0;
+ int tmp_skip_txfm_sb;
+ int64_t tmp_skip_sse_sb;
+ DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_SB_SQUARE]);
+ uint8_t *intrapred;
+
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ intrapred = CONVERT_TO_BYTEPTR(intrapred_);
+ else
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ intrapred = intrapred_;
+
+ mbmi->ref_frame[1] = NONE;
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ xd->plane[j].dst.buf = tmp_buf + j * MAX_SB_SQUARE;
+ xd->plane[j].dst.stride = bw;
+ }
+ av1_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
+ mbmi->ref_frame[1] = INTRA_FRAME;
+ mbmi->use_wedge_interintra = 0;
+
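+    // Pick the interintra intra mode with a fast model-based RD scan over all
+    // candidates, then re-evaluate the winner below with a more accurate
+    // Y-plane RD estimate.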
+ for (j = 0; j < INTERINTRA_MODES; ++j) {
+ mbmi->interintra_mode = (INTERINTRA_MODE)j;
+ rmode = interintra_mode_cost[mbmi->interintra_mode];
+ av1_build_intra_predictors_for_interintra(xd, bsize, 0, intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
+ rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_mv + rate_sum, dist_sum);
+ if (rd < best_interintra_rd) {
+ best_interintra_rd = rd;
+ best_interintra_mode = mbmi->interintra_mode;
+ }
+ }
+ mbmi->interintra_mode = best_interintra_mode;
+ rmode = interintra_mode_cost[mbmi->interintra_mode];
+ av1_build_intra_predictors_for_interintra(xd, bsize, 0, intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ av1_subtract_plane(x, bsize, 0);
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, x->rddiv, rate_mv + rmode + rate_sum, dist_sum);
+ best_interintra_rd = rd;
+
+ if (ref_best_rd < INT64_MAX && best_interintra_rd > 2 * ref_best_rd) {
+ return INT64_MAX;
+ }
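+    // Optionally add a wedge mask on top of interintra: compare the wedge and
+    // no-wedge RD costs and, for new-MV modes, refine the motion vector
+    // against the mask before deciding.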
+ if (is_interintra_wedge_used(bsize)) {
+ rwedge = av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 0);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, x->rddiv, rmode + rate_mv + rwedge + rate_sum,
+ dist_sum);
+ best_interintra_rd_nowedge = rd;
+
+      // Disable the wedge search if the source variance is small.
+ if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) {
+ mbmi->use_wedge_interintra = 1;
+
+ rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
+ av1_cost_bit(cm->fc->wedge_interintra_prob[bsize], 1);
+
+ best_interintra_rd_wedge =
+ pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+
+ best_interintra_rd_wedge +=
+ RDCOST(x->rdmult, x->rddiv, rmode + rate_mv + rwedge, 0);
+ // Refine motion vector.
+ if (have_newmv_in_inter_mode(this_mode)) {
+          // Get the complement of the wedge mask.
+ const uint8_t *mask = av1_get_contiguous_soft_mask(
+ mbmi->interintra_wedge_index, 1, bsize);
+ do_masked_motion_search(cpi, x, mask, bw, bsize, mi_row, mi_col,
+ &tmp_mv, &tmp_rate_mv, 0, mv_idx);
+ mbmi->mv[0].as_int = tmp_mv.as_int;
+ av1_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
+ rd = RDCOST(x->rdmult, x->rddiv,
+ rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum);
+ if (rd < best_interintra_rd_wedge) {
+ best_interintra_rd_wedge = rd;
+ } else {
+ tmp_mv.as_int = cur_mv[0].as_int;
+ tmp_rate_mv = rate_mv;
+ }
+ } else {
+ tmp_mv.as_int = cur_mv[0].as_int;
+ tmp_rate_mv = rate_mv;
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ }
+        // Re-evaluate with a more accurate RD estimate.
+ av1_subtract_plane(x, bsize, 0);
+ rd =
+ estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, x->rddiv,
+ rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum);
+ best_interintra_rd_wedge = rd;
+ if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
+ mbmi->use_wedge_interintra = 1;
+ best_interintra_rd = best_interintra_rd_wedge;
+ mbmi->mv[0].as_int = tmp_mv.as_int;
+ *rate2 += tmp_rate_mv - rate_mv;
+ rate_mv = tmp_rate_mv;
+ } else {
+ mbmi->use_wedge_interintra = 0;
+ best_interintra_rd = best_interintra_rd_nowedge;
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ }
+ } else {
+ mbmi->use_wedge_interintra = 0;
+ best_interintra_rd = best_interintra_rd_nowedge;
+ }
+ }
+
+ pred_exists = 0;
+ *compmode_interintra_cost =
+ av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 1);
+ *compmode_interintra_cost += interintra_mode_cost[mbmi->interintra_mode];
+ if (is_interintra_wedge_used(bsize)) {
+ *compmode_interintra_cost += av1_cost_bit(
+ cm->fc->wedge_interintra_prob[bsize], mbmi->use_wedge_interintra);
+ if (mbmi->use_wedge_interintra) {
+ *compmode_interintra_cost +=
+ av1_cost_literal(get_interintra_wedge_bits(bsize));
+ }
+ }
+ } else if (is_interintra_allowed(mbmi)) {
+ *compmode_interintra_cost =
+ av1_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 0);
+ }
+
+#if CONFIG_EXT_INTERP
+ if (!av1_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE) {
+#if CONFIG_DUAL_FILTER
+ for (i = 0; i < 4; ++i) mbmi->interp_filter[i] = EIGHTTAP_REGULAR;
+#else
+ mbmi->interp_filter = EIGHTTAP_REGULAR;
+#endif
+ pred_exists = 0;
+ }
+#endif // CONFIG_EXT_INTERP
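+  // Rebuild the inter prediction and its modelled RD if any of the searches
+  // above invalidated the current predictor.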
+ if (pred_exists == 0) {
+ int tmp_rate;
+ int64_t tmp_dist;
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+ &tmp_dist, &skip_txfm_sb, &skip_sse_sb);
+ rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
+ }
+#endif // CONFIG_EXT_INTER
+
+#if CONFIG_DUAL_FILTER
+ if (!is_comp_pred) single_filter[this_mode][refs[0]] = mbmi->interp_filter[0];
+#else
if (!is_comp_pred) single_filter[this_mode][refs[0]] = mbmi->interp_filter;
+#endif
+
+#if CONFIG_EXT_INTER
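+  // Prune compound modes whose RD is clearly worse than the best modelled RD
+  // of their constituent single-reference modes; for single-reference
+  // (non-interintra) modes, record the modelled RD for later pruning.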
+ if (modelled_rd != NULL) {
+ if (is_comp_pred) {
+ const int mode0 = compound_ref0_mode(this_mode);
+ const int mode1 = compound_ref1_mode(this_mode);
+ int64_t mrd =
+ AOMMIN(modelled_rd[mode0][refs[0]], modelled_rd[mode1][refs[1]]);
+ if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) {
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
+ return INT64_MAX;
+ }
+ } else if (!is_comp_interintra_pred) {
+ modelled_rd[this_mode][refs[0]] = rd;
+ }
+ }
+#endif // CONFIG_EXT_INTER
if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
// if current pred_error modeled rd is substantially more than the best
@@ -3960,79 +7702,151 @@
}
}
- *rate2 += av1_get_switchable_rate(cpi, xd);
+ if (cm->interp_filter == SWITCHABLE) *rate2 += rs;
#if CONFIG_MOTION_VAR
rate2_nocoeff = *rate2;
#endif // CONFIG_MOTION_VAR
-#if CONFIG_MOTION_VAR
- rd = INT64_MAX;
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ best_rd = INT64_MAX;
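+  // Search the allowed motion modes (simple translation, OBMC, ...) and keep
+  // the one with the best RD; for single-reference new-MV modes, OBMC re-runs
+  // the motion search before building its prediction.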
for (mbmi->motion_mode = SIMPLE_TRANSLATION;
- mbmi->motion_mode < (allow_motion_variation ? MOTION_MODES : 1);
+ mbmi->motion_mode < (allow_motvar ? MOTION_MODES : 1);
mbmi->motion_mode++) {
- int64_t tmp_rd;
+ int64_t tmp_rd = INT64_MAX;
+#if CONFIG_EXT_INTER
+ int tmp_rate2 = mbmi->motion_mode != SIMPLE_TRANSLATION ? rate2_bmc_nocoeff
+ : rate2_nocoeff;
+#else
int tmp_rate2 = rate2_nocoeff;
+#endif // CONFIG_EXT_INTER
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+ InterpFilter obmc_interp_filter[2][2] = {
+ { mbmi->interp_filter[0], mbmi->interp_filter[1] }, // obmc == 0
+ { mbmi->interp_filter[0], mbmi->interp_filter[1] } // obmc == 1
+ };
+#else
+ InterpFilter obmc_interp_filter[2] = {
+ mbmi->interp_filter, // obmc == 0
+ mbmi->interp_filter // obmc == 1
+ };
+#endif // CONFIG_DUAL_FILTER
+#endif // CONFIG_EXT_INTERP
+#if CONFIG_MOTION_VAR
+ int tmp_rate;
+ int64_t tmp_dist;
if (mbmi->motion_mode == OBMC_CAUSAL) {
- if (!is_comp_pred && this_mode == NEWMV) {
- int_mv tmp_mv;
+#if CONFIG_EXT_INTER
+ *mbmi = best_bmc_mbmi;
+ mbmi->motion_mode = OBMC_CAUSAL;
+#endif // CONFIG_EXT_INTER
+ if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) {
int tmp_rate_mv = 0;
- single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv,
+ single_motion_search(cpi, x, bsize, mi_row, mi_col,
+#if CONFIG_EXT_INTER
+ 0, mv_idx,
+#endif // CONFIG_EXT_INTER
&tmp_rate_mv);
- mbmi->mv[0].as_int = tmp_mv.as_int;
- if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
+ mbmi->mv[0].as_int = x->best_mv.as_int;
+ if (discount_newmv_test(cpi, this_mode, mbmi->mv[0], mode_mv,
+ refs[0])) {
tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
}
+#if CONFIG_EXT_INTER
+ tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv;
+#else
tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
+#endif // CONFIG_EXT_INTER
#if CONFIG_EXT_INTERP
- if (cm->interp_filter = SWITCHABLE && !is_interp_needed(xd)) {
- tmp_rate2 -= rs;
- mbmi->interp_filter = EIGHT_TAP;
- }
+#if CONFIG_DUAL_FILTER
+ if (!has_subpel_mv_component(xd->mi[0], xd, 0))
+ obmc_interp_filter[1][0] = mbmi->interp_filter[0] = EIGHTTAP_REGULAR;
+ if (!has_subpel_mv_component(xd->mi[0], xd, 1))
+ obmc_interp_filter[1][1] = mbmi->interp_filter[1] = EIGHTTAP_REGULAR;
+#else
+ if (!av1_is_interp_needed(xd))
+ obmc_interp_filter[1] = mbmi->interp_filter = EIGHTTAP_REGULAR;
+#endif // CONFIG_DUAL_FILTER
+ // This is not quite correct with CONFIG_DUAL_FILTER when a filter
+ // is needed in only one direction
+ if (!av1_is_interp_needed(xd)) tmp_rate2 -= rs;
#endif // CONFIG_EXT_INTERP
av1_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+#if CONFIG_EXT_INTER
+ } else {
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+#endif // CONFIG_EXT_INTER
}
av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, above_pred_buf,
above_pred_stride, left_pred_buf,
left_pred_stride);
- model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb,
- &skip_sse_sb);
+ model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1, &tmp_rate,
+ &tmp_dist, &skip_txfm_sb, &skip_sse_sb);
}
+#endif // CONFIG_MOTION_VAR
+#if CONFIG_WARPED_MOTION
+ if (mbmi->motion_mode == WARPED_CAUSAL) {
+ // TODO(yuec): Add code
+ }
+#endif // CONFIG_WARPED_MOTION
x->skip = 0;
*rate2 = tmp_rate2;
- if (allow_motion_variation)
- *rate2 += cpi->motion_mode_cost[bsize][mbmi->motion_mode];
+ if (allow_motvar) *rate2 += cpi->motion_mode_cost[bsize][mbmi->motion_mode];
*distortion = 0;
-#endif // CONFIG_MOTION_VAR
-
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
if (!skip_txfm_sb) {
int skippable_y, skippable_uv;
int64_t sseuv = INT64_MAX;
int64_t rdcosty = INT64_MAX;
+ int is_cost_valid_uv = 0;
+#if CONFIG_VAR_TX
+ RD_STATS rd_stats_uv;
+#endif
// Y cost and distortion
#if !CONFIG_PVQ
av1_subtract_plane(x, bsize, 0);
#endif
- super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse, bsize,
- ref_best_rd);
+#if CONFIG_VAR_TX
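+    // With variable transform sizes, select the Y-plane transform partition
+    // and type; otherwise fall back to a single uniform transform size for
+    // the whole block.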
+ if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
+ RD_STATS rd_stats_y;
+ select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, ref_best_rd);
+ *rate_y = rd_stats_y.rate;
+ distortion_y = rd_stats_y.dist;
+ skippable_y = rd_stats_y.skip;
+ *psse = rd_stats_y.sse;
+ } else {
+ int idx, idy;
+ super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
+ bsize, ref_best_rd);
+ for (idy = 0; idy < xd->n8_h; ++idy)
+ for (idx = 0; idx < xd->n8_w; ++idx)
+ mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
+ memset(x->blk_skip[0], skippable_y,
+ sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+ }
+#else
+ super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse, bsize,
+ ref_best_rd);
+#endif // CONFIG_VAR_TX
if (*rate_y == INT_MAX) {
*rate2 = INT_MAX;
*distortion = INT64_MAX;
-#if CONFIG_MOTION_VAR
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
if (mbmi->motion_mode != SIMPLE_TRANSLATION) {
continue;
} else {
-#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
restore_dst_buf(xd, orig_dst, orig_dst_stride);
return INT64_MAX;
-#if CONFIG_MOTION_VAR
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
}
-#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
}
*rate2 += *rate_y;
@@ -4040,24 +7854,39 @@
rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
rdcosty = AOMMIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
-
- if (!super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
- &sseuv, bsize, ref_best_rd - rdcosty)) {
+/* clang-format off */
+#if CONFIG_VAR_TX
+ is_cost_valid_uv =
+ inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, ref_best_rd - rdcosty);
+#if CONFIG_RD_DEBUG
+ // record uv planes' transform block coefficient cost
+ if (is_cost_valid_uv) av1_merge_rd_stats(&mbmi->rd_stats, &rd_stats_uv);
+#endif
+ *rate_uv = rd_stats_uv.rate;
+ distortion_uv = rd_stats_uv.dist;
+ skippable_uv = rd_stats_uv.skip;
+ sseuv = rd_stats_uv.sse;
+#else
+ is_cost_valid_uv =
+ super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
+ &sseuv, bsize, ref_best_rd - rdcosty);
+#endif // CONFIG_VAR_TX
+ if (!is_cost_valid_uv) {
*rate2 = INT_MAX;
*distortion = INT64_MAX;
-#if CONFIG_MOTION_VAR
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
continue;
#else
- restore_dst_buf(xd, orig_dst, orig_dst_stride);
- return INT64_MAX;
-#endif // CONFIG_MOTION_VAR
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
+ return INT64_MAX;
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
}
-
+ /* clang-format on */
*psse += sseuv;
*rate2 += *rate_uv;
*distortion += distortion_uv;
*skippable = skippable_y && skippable_uv;
-#if CONFIG_MOTION_VAR
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
if (*skippable) {
*rate2 -= *rate_uv + *rate_y;
*rate_y = 0;
@@ -4083,36 +7912,72 @@
mbmi->skip = 0;
}
*disable_skip = 0;
-#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
} else {
x->skip = 1;
*disable_skip = 1;
-#if CONFIG_MOTION_VAR
- mbmi->skip = 0;
-#endif // CONFIG_MOTION_VAR
+ mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode, 1);
- // The cost of skip bit needs to be added.
+// The cost of the skip bit needs to be added.
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ mbmi->skip = 0;
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
*rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
*distortion = skip_sse_sb;
+ *psse = skip_sse_sb;
+ *rate_y = 0;
+ *rate_uv = 0;
+ *skippable = 1;
}
+#if CONFIG_GLOBAL_MOTION
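+  // Under global motion, ZEROMV is charged an extra per-reference rate term
+  // (GLOBAL_MOTION_RATE) for the global model it implies.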
+ if (this_mode == ZEROMV) {
+ *rate2 += GLOBAL_MOTION_RATE(mbmi->ref_frame[0]);
+ if (is_comp_pred) *rate2 += GLOBAL_MOTION_RATE(mbmi->ref_frame[1]);
+ }
+#endif // CONFIG_GLOBAL_MOTION
-#if CONFIG_MOTION_VAR
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
tmp_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
- if (mbmi->motion_mode == SIMPLE_TRANSLATION || (tmp_rd < rd)) {
+ if (mbmi->motion_mode == SIMPLE_TRANSLATION || (tmp_rd < best_rd)) {
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] = obmc_interp_filter[mbmi->motion_mode][0];
+ mbmi->interp_filter[1] = obmc_interp_filter[mbmi->motion_mode][1];
+#else
+ mbmi->interp_filter = obmc_interp_filter[mbmi->motion_mode];
+#endif // CONFIG_DUAL_FILTER
+#endif // CONFIG_EXT_INTERP
best_mbmi = *mbmi;
- rd = tmp_rd;
+ best_rd = tmp_rd;
best_rate2 = *rate2;
best_rate_y = *rate_y;
best_rate_uv = *rate_uv;
+#if CONFIG_VAR_TX
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(best_blk_skip[i], x->blk_skip[i],
+ sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+#endif // CONFIG_VAR_TX
best_distortion = *distortion;
best_skippable = *skippable;
best_xskip = x->skip;
best_disable_skip = *disable_skip;
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ x->recon_variance = av1_high_get_sby_perpixel_variance(
+ cpi, &xd->plane[0].dst, bsize, xd->bd);
+ } else {
+ x->recon_variance =
+ av1_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+ }
+#else
+ x->recon_variance =
+ av1_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+#endif // CONFIG_AOM_HIGHBITDEPTH
}
}
- if (rd == INT64_MAX) {
+ if (best_rd == INT64_MAX) {
*rate2 = INT_MAX;
*distortion = INT64_MAX;
restore_dst_buf(xd, orig_dst, orig_dst_stride);
@@ -4122,14 +7987,34 @@
*rate2 = best_rate2;
*rate_y = best_rate_y;
*rate_uv = best_rate_uv;
+#if CONFIG_VAR_TX
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(x->blk_skip[i], best_blk_skip[i],
+ sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+#endif // CONFIG_VAR_TX
*distortion = best_distortion;
*skippable = best_skippable;
x->skip = best_xskip;
*disable_skip = best_disable_skip;
-#endif // CONFIG_MOTION_VAR
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
if (!is_comp_pred) single_skippable[this_mode][refs[0]] = *skippable;
+#if !(CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION)
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ x->recon_variance = av1_high_get_sby_perpixel_variance(
+ cpi, &xd->plane[0].dst, bsize, xd->bd);
+ } else {
+ x->recon_variance =
+ av1_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+ }
+#else
+ x->recon_variance =
+ av1_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+#endif // CONFIG_AOM_HIGHBITDEPTH
+#endif // !(CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION)
+
restore_dst_buf(xd, orig_dst, orig_dst_stride);
return 0; // The rate-distortion cost will be re-calculated by caller.
}
@@ -4155,16 +8040,14 @@
return;
}
} else {
- y_skip = 0;
if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
- &dist_y, best_rd) >= best_rd) {
+ &dist_y, &y_skip, best_rd) >= best_rd) {
rd_cost->rate = INT_MAX;
return;
}
}
- max_uv_tx_size = get_uv_tx_size_impl(
- xd->mi[0]->mbmi.tx_size, bsize, pd[1].subsampling_x, pd[1].subsampling_y);
-
+ max_uv_tx_size = uv_txsize_lookup[bsize][xd->mi[0]->mbmi.tx_size]
+ [pd[1].subsampling_x][pd[1].subsampling_y];
rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv,
&uv_skip, AOMMAX(BLOCK_8X8, bsize), max_uv_tx_size);
@@ -4248,13 +8131,12 @@
// In most cases this is the "real" edge unless there are formatting
// bars embedded in the stream.
int av1_active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) {
- return av1_active_h_edge(cpi, mi_row, MAX_MIB_SIZE) ||
- av1_active_v_edge(cpi, mi_col, MAX_MIB_SIZE);
+ return av1_active_h_edge(cpi, mi_row, cpi->common.mib_size) ||
+ av1_active_v_edge(cpi, mi_col, cpi->common.mib_size);
}
#if CONFIG_PALETTE
-static void restore_uv_color_map(const AV1_COMP *const cpi,
- MACROBLOCK *const x) {
+static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
@@ -4273,9 +8155,8 @@
#if CONFIG_AOM_HIGHBITDEPTH
const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
-#else
- (void)cpi;
#endif // CONFIG_AOM_HIGHBITDEPTH
+ (void)cpi;
for (r = 0; r < rows; ++r) {
for (c = 0; c < cols; ++c) {
@@ -4304,6 +8185,188 @@
}
#endif // CONFIG_PALETTE
+#if CONFIG_FILTER_INTRA
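+// Evaluate DC_PRED combined with filter-intra as an extra intra candidate
+// during inter-frame mode selection, updating the best mode when it wins in
+// RD terms.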
+static void pick_filter_intra_interframe(
+ const AV1_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+ BLOCK_SIZE bsize, int *rate_uv_intra, int *rate_uv_tokenonly,
+ int64_t *dist_uv, int *skip_uv, PREDICTION_MODE *mode_uv,
+ FILTER_INTRA_MODE_INFO *filter_intra_mode_info_uv,
+#if CONFIG_EXT_INTRA
+ int8_t *uv_angle_delta,
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_PALETTE
+ PALETTE_MODE_INFO *pmi_uv, int palette_ctx,
+#endif // CONFIG_PALETTE
+ int skip_mask, unsigned int *ref_costs_single, int64_t *best_rd,
+ int64_t *best_intra_rd, PREDICTION_MODE *best_intra_mode,
+ int *best_mode_index, int *best_skip2, int *best_mode_skippable,
+#if CONFIG_SUPERTX
+ int *returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ int64_t *best_pred_rd, MB_MODE_INFO *best_mbmode, RD_COST *rd_cost) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_PALETTE
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+#endif // CONFIG_PALETTE
+ const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+ int rate2 = 0, rate_y = INT_MAX, skippable = 0, rate_uv, rate_dummy, i;
+ int dc_mode_index;
+ const int *const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]];
+ int64_t distortion2 = 0, distortion_y = 0, this_rd = *best_rd, distortion_uv;
+ TX_SIZE uv_tx;
+
+ for (i = 0; i < MAX_MODES; ++i)
+ if (av1_mode_order[i].mode == DC_PRED &&
+ av1_mode_order[i].ref_frame[0] == INTRA_FRAME)
+ break;
+ dc_mode_index = i;
+ assert(i < MAX_MODES);
+
+ // TODO(huisu): use skip_mask for further speedup.
+ (void)skip_mask;
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE;
+ if (!rd_pick_filter_intra_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y,
+ &skippable, bsize, intra_mode_cost[mbmi->mode],
+ &this_rd, 0)) {
+ return;
+ }
+ if (rate_y == INT_MAX) return;
+
+ uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x]
+ [xd->plane[1].subsampling_y];
+ if (rate_uv_intra[uv_tx] == INT_MAX) {
+ choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
+ &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx],
+ &skip_uv[uv_tx], &mode_uv[uv_tx]);
+#if CONFIG_PALETTE
+ if (cm->allow_screen_content_tools) pmi_uv[uv_tx] = *pmi;
+#endif // CONFIG_PALETTE
+ filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
+#if CONFIG_EXT_INTRA
+ uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
+#endif // CONFIG_EXT_INTRA
+ }
+
+ rate_uv = rate_uv_tokenonly[uv_tx];
+ distortion_uv = dist_uv[uv_tx];
+ skippable = skippable && skip_uv[uv_tx];
+ mbmi->uv_mode = mode_uv[uv_tx];
+#if CONFIG_PALETTE
+ if (cm->allow_screen_content_tools) {
+ pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+ pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+ 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+ }
+#endif // CONFIG_PALETTE
+#if CONFIG_EXT_INTRA
+ mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
+#endif // CONFIG_EXT_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
+ if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
+ mbmi->filter_intra_mode_info.filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
+ }
+
+ rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv +
+ cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
+#if CONFIG_PALETTE
+ if (cpi->common.allow_screen_content_tools && mbmi->mode == DC_PRED)
+ rate2 += av1_cost_bit(
+ av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0);
+#endif // CONFIG_PALETTE
+
+ if (!xd->lossless[mbmi->segment_id]) {
+ // super_block_yrd above includes the cost of the tx_size in the
+ // tokenonly rate, but for intra blocks, tx_size is always coded
+ // (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ rate_y -= cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)]
+ [tx_size_to_depth(mbmi->tx_size)];
+ }
+
+ rate2 += av1_cost_bit(cm->fc->filter_intra_probs[0],
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0]);
+ rate2 += write_uniform_cost(
+ FILTER_INTRA_MODES, mbmi->filter_intra_mode_info.filter_intra_mode[0]);
+#if CONFIG_EXT_INTRA
+ if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED) {
+ rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+ MAX_ANGLE_DELTAS + mbmi->angle_delta[1]);
+ }
+#endif // CONFIG_EXT_INTRA
+ if (mbmi->mode == DC_PRED) {
+ rate2 +=
+ av1_cost_bit(cpi->common.fc->filter_intra_probs[1],
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1]);
+ if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1])
+ rate2 +=
+ write_uniform_cost(FILTER_INTRA_MODES,
+ mbmi->filter_intra_mode_info.filter_intra_mode[1]);
+ }
+ distortion2 = distortion_y + distortion_uv;
+ av1_encode_intra_block_plane((AV1_COMMON *)cm, x, bsize, 0, 0);
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ x->recon_variance = av1_high_get_sby_perpixel_variance(
+ cpi, &xd->plane[0].dst, bsize, xd->bd);
+ } else {
+ x->recon_variance =
+ av1_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+ }
+#else
+ x->recon_variance =
+ av1_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+ rate2 += ref_costs_single[INTRA_FRAME];
+
+ if (skippable) {
+ rate2 -= (rate_y + rate_uv);
+ rate_y = 0;
+ rate_uv = 0;
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ } else {
+ rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ }
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+
+ if (this_rd < *best_intra_rd) {
+ *best_intra_rd = this_rd;
+ *best_intra_mode = mbmi->mode;
+ }
+ for (i = 0; i < REFERENCE_MODES; ++i)
+ best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd);
+
+ if (this_rd < *best_rd) {
+ *best_mode_index = dc_mode_index;
+ mbmi->mv[0].as_int = 0;
+ rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+ if (x->skip)
+ *returnrate_nocoef = rate2;
+ else
+ *returnrate_nocoef = rate2 - rate_y - rate_uv;
+ *returnrate_nocoef -= av1_cost_bit(av1_get_skip_prob(cm, xd), skippable);
+ *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd),
+ mbmi->ref_frame[0] != INTRA_FRAME);
+#endif // CONFIG_SUPERTX
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+ *best_rd = this_rd;
+ *best_mbmode = *mbmi;
+ *best_skip2 = 0;
+ *best_mode_skippable = skippable;
+ }
+}
+#endif // CONFIG_FILTER_INTRA
+
#if CONFIG_MOTION_VAR
static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
const MACROBLOCKD *xd, int mi_row,
@@ -4314,8 +8377,12 @@
void av1_rd_pick_inter_mode_sb(const AV1_COMP *cpi, TileDataEnc *tile_data,
MACROBLOCK *x, int mi_row, int mi_col,
- RD_COST *rd_cost, BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) {
+ RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+ int *returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
const AV1_COMMON *const cm = &cpi->common;
const RD_OPT *const rd_opt = &cpi->rd;
const SPEED_FEATURES *const sf = &cpi->sf;
@@ -4330,12 +8397,18 @@
MV_REFERENCE_FRAME ref_frame, second_ref_frame;
unsigned char segment_id = mbmi->segment_id;
int comp_pred, i, k;
- int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
- struct buf_2d yv12_mb[MAX_REF_FRAMES][MAX_MB_PLANE];
- int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
- InterpFilter single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
- int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
- static const int flag_list[REFS_PER_FRAME + 1] = {
+ int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
+ struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE];
+#if CONFIG_EXT_INTER
+ int_mv single_newmvs[2][TOTAL_REFS_PER_FRAME] = { { { 0 } }, { { 0 } } };
+ int single_newmvs_rate[2][TOTAL_REFS_PER_FRAME] = { { 0 }, { 0 } };
+ int64_t modelled_rd[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
+#else
+ int_mv single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } };
+#endif // CONFIG_EXT_INTER
+ InterpFilter single_inter_filter[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
+ int single_skippable[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
+ static const int flag_list[TOTAL_REFS_PER_FRAME] = {
0,
AOM_LAST_FLAG,
#if CONFIG_EXT_REFS
@@ -4349,6 +8422,7 @@
AOM_ALT_FLAG
};
int64_t best_rd = best_rd_so_far;
+ int best_rate_y = INT_MAX, best_rate_uv = INT_MAX;
int64_t best_pred_diff[REFERENCE_MODES];
int64_t best_pred_rd[REFERENCE_MODES];
MB_MODE_INFO best_mbmode;
@@ -4358,29 +8432,48 @@
#endif
int best_mode_skippable = 0;
int midx, best_mode_index = -1;
- unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+ unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME];
+ unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME];
aom_prob comp_mode_p;
int64_t best_intra_rd = INT64_MAX;
unsigned int best_pred_sse = UINT_MAX;
PREDICTION_MODE best_intra_mode = DC_PRED;
int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
- int64_t dist_uv[TX_SIZES];
- int skip_uv[TX_SIZES];
+ int64_t dist_uvs[TX_SIZES];
+ int skip_uvs[TX_SIZES];
PREDICTION_MODE mode_uv[TX_SIZES];
#if CONFIG_PALETTE
PALETTE_MODE_INFO pmi_uv[TX_SIZES];
#endif // CONFIG_PALETTE
+#if CONFIG_EXT_INTRA
+ int8_t uv_angle_delta[TX_SIZES];
+ int is_directional_mode, angle_stats_ready = 0;
+ int rate_overhead, rate_dummy;
+ uint8_t directional_mode_skip_mask[INTRA_MODES];
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ int8_t dc_skipped = 1;
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info_uv[TX_SIZES];
+#endif // CONFIG_FILTER_INTRA
const int intra_cost_penalty = av1_get_intra_cost_penalty(
cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+ const int *const intra_mode_cost = cpi->mbmode_cost[size_group_lookup[bsize]];
int best_skip2 = 0;
uint8_t ref_frame_skip_mask[2] = { 0 };
- uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
+#if CONFIG_EXT_INTER
+ uint32_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 };
+ MV_REFERENCE_FRAME best_single_inter_ref = LAST_FRAME;
+ int64_t best_single_inter_rd = INT64_MAX;
+#else
+ uint16_t mode_skip_mask[TOTAL_REFS_PER_FRAME] = { 0 };
+#endif // CONFIG_EXT_INTER
int mode_skip_start = sf->mode_skip_start + 1;
const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
int64_t mode_threshold[MAX_MODES];
int *mode_map = tile_data->mode_map[bsize];
const int mode_search_skip_flags = sf->mode_search_skip_flags;
+ const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
#if CONFIG_PVQ
od_rollback_buffer pre_buf;
#endif
@@ -4388,18 +8481,12 @@
#if CONFIG_PALETTE || CONFIG_EXT_INTRA
const int rows = 4 * num_4x4_blocks_high_lookup[bsize];
const int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
-#endif
+#endif // CONFIG_PALETTE || CONFIG_EXT_INTRA
#if CONFIG_PALETTE
int palette_ctx = 0;
const MODE_INFO *above_mi = xd->above_mi;
const MODE_INFO *left_mi = xd->left_mi;
#endif // CONFIG_PALETTE
-#if CONFIG_EXT_INTRA
- int angle_stats_ready = 0;
- int8_t uv_angle_delta[TX_SIZES];
- uint8_t directional_mode_skip_mask[INTRA_MODES];
- const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
-#endif // CONFIG_EXT_INTRA
#if CONFIG_MOTION_VAR
#if CONFIG_AOM_HIGHBITDEPTH
DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
@@ -4411,6 +8498,10 @@
DECLARE_ALIGNED(16, int32_t, weighted_src_buf[MAX_SB_SQUARE]);
DECLARE_ALIGNED(16, int32_t, mask2d_buf[MAX_SB_SQUARE]);
uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
int dst_stride1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
int dst_stride2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
@@ -4436,10 +8527,6 @@
#endif // CONFIG_AOM_HIGHBITDEPTH
#endif // CONFIG_MOTION_VAR
-#if CONFIG_EXT_INTRA
- memset(directional_mode_skip_mask, 0,
- sizeof(directional_mode_skip_mask[0]) * INTRA_MODES);
-#endif // CONFIG_EXT_INTRA
av1_zero(best_mbmode);
#if CONFIG_PALETTE
@@ -4452,31 +8539,52 @@
}
#endif // CONFIG_PALETTE
+#if CONFIG_EXT_INTRA
+ memset(directional_mode_skip_mask, 0,
+ sizeof(directional_mode_skip_mask[0]) * INTRA_MODES);
+#endif // CONFIG_EXT_INTRA
+
estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
&comp_mode_p);
for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX;
for (i = 0; i < TX_SIZES; i++) rate_uv_intra[i] = INT_MAX;
- for (i = 0; i < MAX_REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+ for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX;
for (i = 0; i < MB_MODE_COUNT; ++i) {
- for (k = 0; k < MAX_REF_FRAMES; ++k) {
+ for (k = 0; k < TOTAL_REFS_PER_FRAME; ++k) {
single_inter_filter[i][k] = SWITCHABLE;
single_skippable[i][k] = 0;
}
}
rd_cost->rate = INT_MAX;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
x->pred_mv_sad[ref_frame] = INT_MAX;
x->mbmi_ext->mode_context[ref_frame] = 0;
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ x->mbmi_ext->compound_mode_context[ref_frame] = 0;
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
if (cpi->ref_frame_flags & flag_list[ref_frame]) {
assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
}
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+#if CONFIG_GLOBAL_MOTION
+ frame_mv[ZEROMV][ref_frame].as_int =
+ gm_get_motion_vector(&cm->global_motion[ref_frame]).as_int;
+#else // CONFIG_GLOBAL_MOTION
frame_mv[ZEROMV][ref_frame].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_EXT_INTER
+ frame_mv[NEWFROMNEARMV][ref_frame].as_int = INVALID_MV;
+ frame_mv[NEW_NEWMV][ref_frame].as_int = INVALID_MV;
+ frame_mv[ZERO_ZEROMV][ref_frame].as_int = 0;
+#endif // CONFIG_EXT_INTER
}
#if CONFIG_REF_MV
@@ -4485,26 +8593,20 @@
int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
x->mbmi_ext->mode_context[ref_frame] = 0;
av1_find_mv_refs(cm, xd, mi, ref_frame, &mbmi_ext->ref_mv_count[ref_frame],
- mbmi_ext->ref_mv_stack[ref_frame], candidates, mi_row,
- mi_col, NULL, NULL, mbmi_ext->mode_context);
-
- if (mbmi_ext->ref_mv_count[ref_frame] < 2) {
- MV_REFERENCE_FRAME rf[2];
- av1_set_ref_frame(rf, ref_frame);
- if (mbmi_ext->ref_mvs[rf[0]][0].as_int != 0 ||
- mbmi_ext->ref_mvs[rf[0]][1].as_int != 0 ||
- mbmi_ext->ref_mvs[rf[1]][0].as_int != 0 ||
- mbmi_ext->ref_mvs[rf[1]][1].as_int != 0)
- mbmi_ext->mode_context[ref_frame] &= ~(1 << ALL_ZERO_FLAG_OFFSET);
- }
+ mbmi_ext->ref_mv_stack[ref_frame],
+#if CONFIG_EXT_INTER
+ mbmi_ext->compound_mode_context,
+#endif // CONFIG_EXT_INTER
+ candidates, mi_row, mi_col, NULL, NULL,
+ mbmi_ext->mode_context);
}
-#endif
+#endif // CONFIG_REF_MV
#if CONFIG_MOTION_VAR
av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
- dst_stride1);
+ dst_width1, dst_height1, dst_stride1);
av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
- dst_stride2);
+ dst_width2, dst_height2, dst_stride2);
av1_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
x->mask_buf = mask2d_buf;
x->wsrc_buf = weighted_src_buf;
@@ -4555,6 +8657,7 @@
// an unfiltered alternative. We allow near/nearest as well
// because they may result in zero-zero MVs but be cheaper.
if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+ int_mv zeromv;
ref_frame_skip_mask[0] = (1 << LAST_FRAME) |
#if CONFIG_EXT_REFS
(1 << LAST2_FRAME) | (1 << LAST3_FRAME) |
@@ -4562,16 +8665,35 @@
#endif // CONFIG_EXT_REFS
(1 << GOLDEN_FRAME);
ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+      // TODO(zoeliu): Explore whether the following also needs to be done for
+      // BWDREF_FRAME.
mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
- if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
+#if CONFIG_GLOBAL_MOTION
+ zeromv.as_int =
+ gm_get_motion_vector(&cm->global_motion[ALTREF_FRAME]).as_int;
+#else
+ zeromv.as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+ if (frame_mv[NEARMV][ALTREF_FRAME].as_int != zeromv.as_int)
mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
- if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
+ if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int)
mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
+#if CONFIG_EXT_INTER
+ if (frame_mv[NEAREST_NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARESTMV);
+ if (frame_mv[NEAREST_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARMV);
+ if (frame_mv[NEAR_NEARESTMV][ALTREF_FRAME].as_int != zeromv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARESTMV);
+ if (frame_mv[NEAR_NEARMV][ALTREF_FRAME].as_int != zeromv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARMV);
+#endif // CONFIG_EXT_INTER
}
}
if (cpi->rc.is_src_frame_alt_ref) {
if (sf->alt_ref_search_fp) {
+ assert(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]);
mode_skip_mask[ALTREF_FRAME] = 0;
ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
@@ -4616,15 +8738,34 @@
midx = end_pos;
}
+ if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
+ x->use_default_intra_tx_type = 1;
+ else
+ x->use_default_intra_tx_type = 0;
+
+ if (cpi->sf.tx_type_search.fast_inter_tx_type_search)
+ x->use_default_inter_tx_type = 1;
+ else
+ x->use_default_inter_tx_type = 0;
#if CONFIG_PVQ
od_encode_checkpoint(&x->daala_enc, &pre_buf);
#endif
+#if CONFIG_EXT_INTER
+ for (i = 0; i < MB_MODE_COUNT; ++i)
+ for (ref_frame = 0; ref_frame < TOTAL_REFS_PER_FRAME; ++ref_frame)
+ modelled_rd[i][ref_frame] = INT64_MAX;
+#endif // CONFIG_EXT_INTER
+
for (midx = 0; midx < MAX_MODES; ++midx) {
- int mode_index = mode_map[midx];
+ int mode_index;
int mode_excluded = 0;
int64_t this_rd = INT64_MAX;
int disable_skip = 0;
int compmode_cost = 0;
+#if CONFIG_EXT_INTER
+ int compmode_interintra_cost = 0;
+ int compmode_wedge_cost = 0;
+#endif // CONFIG_EXT_INTER
int rate2 = 0, rate_y = 0, rate_uv = 0;
int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
int skippable = 0;
@@ -4636,13 +8777,30 @@
#if CONFIG_PVQ
od_encode_rollback(&x->daala_enc, &pre_buf);
#endif
+ mode_index = mode_map[midx];
this_mode = av1_mode_order[mode_index].mode;
ref_frame = av1_mode_order[mode_index].ref_frame[0];
second_ref_frame = av1_mode_order[mode_index].ref_frame[1];
-
#if CONFIG_REF_MV
mbmi->ref_mv_idx = 0;
#endif
+
+#if CONFIG_EXT_INTER
+ if (ref_frame > INTRA_FRAME && second_ref_frame == INTRA_FRAME) {
+      // The mode must be compatible with interintra prediction.
+ assert(is_interintra_allowed_mode(this_mode));
+
+ if (!is_interintra_allowed_bsize(bsize)) continue;
+ }
+
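+      // Compound modes are seeded with the motion vectors already found for
+      // their constituent single-reference modes.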
+ if (is_inter_compound_mode(this_mode)) {
+ frame_mv[this_mode][ref_frame].as_int =
+ frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int;
+ frame_mv[this_mode][second_ref_frame].as_int =
+ frame_mv[compound_ref1_mode(this_mode)][second_ref_frame].as_int;
+ }
+#endif // CONFIG_EXT_INTER
+
// Look at the reference frame of the best mode so far and set the
// skip mask to look at a subset of the remaining modes.
if (midx == mode_skip_start && best_mode_index >= 0) {
@@ -4678,7 +8836,9 @@
#endif // CONFIG_EXT_REFS
break;
case NONE:
- case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break;
+ case TOTAL_REFS_PER_FRAME:
+ assert(0 && "Invalid Reference frame");
+ break;
}
}
@@ -4739,10 +8899,20 @@
if (conditional_skipintra(this_mode, best_intra_mode)) continue;
}
}
+#if CONFIG_GLOBAL_MOTION
+ } else if (get_gmtype(&cm->global_motion[ref_frame]) == GLOBAL_ZERO &&
+ (!comp_pred ||
+ get_gmtype(&cm->global_motion[second_ref_frame]) ==
+ GLOBAL_ZERO)) {
+#else // CONFIG_GLOBAL_MOTION
} else {
+#endif // CONFIG_GLOBAL_MOTION
const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, second_ref_frame };
- if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv, this_mode,
- ref_frames, bsize, -1))
+ if (!check_best_zero_mv(cpi, mbmi_ext->mode_context,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ mbmi_ext->compound_mode_context,
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
+ frame_mv, this_mode, ref_frames, bsize, -1))
continue;
}
@@ -4754,18 +8924,24 @@
pmi->palette_size[0] = 0;
pmi->palette_size[1] = 0;
#endif // CONFIG_PALETTE
- // Evaluate all sub-pel filters irrespective of whether we can use
- // them for this frame.
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
+ // Evaluate all sub-pel filters irrespective of whether we can use
+ // them for this frame.
+#if CONFIG_DUAL_FILTER
+ for (i = 0; i < 4; ++i) {
+ mbmi->interp_filter[i] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+ }
+#else
mbmi->interp_filter =
- cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter;
+ cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR : cm->interp_filter;
+#endif
mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
-
-#if CONFIG_MOTION_VAR
mbmi->motion_mode = SIMPLE_TRANSLATION;
-#endif // CONFIG_MOTION_VAR
-#if CONFIG_EXT_INTRA
- mbmi->intra_angle_delta[0] = 0;
-#endif // CONFIG_EXT_INTRA
x->skip = 0;
set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
@@ -4776,12 +8952,16 @@
if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
}
+#if CONFIG_EXT_INTER
+ mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
+#endif // CONFIG_EXT_INTER
+
if (ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx;
struct macroblockd_plane *const pd = &xd->plane[1];
#if CONFIG_EXT_INTRA
- if (is_directional_mode(mbmi->mode)) {
- int rate_dummy;
+ is_directional_mode = (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED);
+ if (is_directional_mode) {
if (!angle_stats_ready) {
const int src_stride = x->plane[0].src.stride;
const uint8_t *src = x->plane[0].src.buf;
@@ -4796,12 +8976,14 @@
angle_stats_ready = 1;
}
if (directional_mode_skip_mask[mbmi->mode]) continue;
+ rate_overhead = write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1, 0) +
+ intra_mode_cost[mbmi->mode];
rate_y = INT_MAX;
- this_rd = rd_pick_intra_angle_sby(
- cpi, x, &rate_dummy, &rate_y, &distortion_y, &skippable, bsize,
- cpi->mbmode_cost[mbmi->mode], best_rd);
+ this_rd =
+ rd_pick_intra_angle_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y,
+ &skippable, bsize, rate_overhead, best_rd);
} else {
- mbmi->intra_angle_delta[0] = 0;
+ mbmi->angle_delta[0] = 0;
super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize,
best_rd);
}
@@ -4809,26 +8991,34 @@
super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize,
best_rd);
#endif // CONFIG_EXT_INTRA
+
if (rate_y == INT_MAX) continue;
- uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd->subsampling_x,
- pd->subsampling_y);
+#if CONFIG_FILTER_INTRA
+ if (mbmi->mode == DC_PRED) dc_skipped = 0;
+#endif // CONFIG_FILTER_INTRA
+
+ uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][pd->subsampling_x]
+ [pd->subsampling_y];
if (rate_uv_intra[uv_tx] == INT_MAX) {
- choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx],
- &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx],
- &skip_uv[uv_tx], &mode_uv[uv_tx]);
+ choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
+ &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx],
+ &skip_uvs[uv_tx], &mode_uv[uv_tx]);
#if CONFIG_PALETTE
if (cm->allow_screen_content_tools) pmi_uv[uv_tx] = *pmi;
#endif // CONFIG_PALETTE
#if CONFIG_EXT_INTRA
- uv_angle_delta[uv_tx] = mbmi->intra_angle_delta[1];
+ uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
+#endif // CONFIG_FILTER_INTRA
}
rate_uv = rate_uv_tokenonly[uv_tx];
- distortion_uv = dist_uv[uv_tx];
- skippable = skippable && skip_uv[uv_tx];
+ distortion_uv = dist_uvs[uv_tx];
+ skippable = skippable && skip_uvs[uv_tx];
mbmi->uv_mode = mode_uv[uv_tx];
#if CONFIG_PALETTE
if (cm->allow_screen_content_tools) {
@@ -4840,33 +9030,112 @@
#endif // CONFIG_PALETTE
#if CONFIG_EXT_INTRA
- mbmi->intra_angle_delta[1] = uv_angle_delta[uv_tx];
+ mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
+ if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
+ mbmi->filter_intra_mode_info.filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
+ }
+#endif // CONFIG_FILTER_INTRA
- rate2 = rate_y + cpi->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
+ rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv +
+ cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
#if CONFIG_PALETTE
if (cpi->common.allow_screen_content_tools && mbmi->mode == DC_PRED)
rate2 += av1_cost_bit(
av1_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx], 0);
#endif // CONFIG_PALETTE
- if (this_mode != DC_PRED && this_mode != TM_PRED)
- rate2 += intra_cost_penalty;
+
+ if (!xd->lossless[mbmi->segment_id]) {
+ // super_block_yrd above includes the cost of the tx_size in the
+ // tokenonly rate, but for intra blocks, tx_size is always coded
+ // (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ rate_y -=
+ cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)]
+ [tx_size_to_depth(mbmi->tx_size)];
+ }
#if CONFIG_EXT_INTRA
- if (is_directional_mode(mbmi->mode)) {
- const int max_angle_delta =
- av1_max_angle_delta_y[max_tx_size][mbmi->mode];
- rate2 +=
- write_uniform_cost(2 * max_angle_delta + 1,
- max_angle_delta + mbmi->intra_angle_delta[0]);
+ if (is_directional_mode) {
+ int p_angle;
+ const int intra_filter_ctx = av1_get_pred_context_intra_interp(xd);
+ rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+ MAX_ANGLE_DELTAS + mbmi->angle_delta[0]);
+ p_angle =
+ mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+ if (av1_is_intra_filter_switchable(p_angle))
+ rate2 += cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter];
+ }
+ if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED) {
+ rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+ MAX_ANGLE_DELTAS + mbmi->angle_delta[1]);
}
#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ if (mbmi->mode == DC_PRED) {
+ rate2 +=
+ av1_cost_bit(cm->fc->filter_intra_probs[0],
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0]);
+ if (mbmi->filter_intra_mode_info.use_filter_intra_mode[0]) {
+ rate2 += write_uniform_cost(
+ FILTER_INTRA_MODES,
+ mbmi->filter_intra_mode_info.filter_intra_mode[0]);
+ }
+ }
+ if (mbmi->uv_mode == DC_PRED) {
+ rate2 +=
+ av1_cost_bit(cpi->common.fc->filter_intra_probs[1],
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1]);
+ if (mbmi->filter_intra_mode_info.use_filter_intra_mode[1])
+ rate2 += write_uniform_cost(
+ FILTER_INTRA_MODES,
+ mbmi->filter_intra_mode_info.filter_intra_mode[1]);
+ }
+#endif // CONFIG_FILTER_INTRA
+ if (this_mode != DC_PRED && this_mode != TM_PRED)
+ rate2 += intra_cost_penalty;
distortion2 = distortion_y + distortion_uv;
+ av1_encode_intra_block_plane((AV1_COMMON *)cm, x, bsize, 0, 1);
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ x->recon_variance = av1_high_get_sby_perpixel_variance(
+ cpi, &xd->plane[0].dst, bsize, xd->bd);
+ } else {
+ x->recon_variance =
+ av1_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+ }
+#else
+ x->recon_variance =
+ av1_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+#endif // CONFIG_AOM_HIGHBITDEPTH
} else {
#if CONFIG_REF_MV
int_mv backup_ref_mv[2];
+
backup_ref_mv[0] = mbmi_ext->ref_mvs[ref_frame][0];
if (comp_pred) backup_ref_mv[1] = mbmi_ext->ref_mvs[second_ref_frame][0];
-
+#endif
+#if CONFIG_EXT_INTER
+ if (second_ref_frame == INTRA_FRAME) {
+ if (best_single_inter_ref != ref_frame) continue;
+ mbmi->interintra_mode = intra_to_interintra_mode[best_intra_mode];
+// TODO(debargha|geza.lore):
+// Should we use ext_intra modes for interintra?
+#if CONFIG_EXT_INTRA
+ mbmi->angle_delta[0] = 0;
+ mbmi->angle_delta[1] = 0;
+ mbmi->intra_filter = INTRA_FILTER_LINEAR;
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
+ }
+#endif // CONFIG_EXT_INTER
+#if CONFIG_REF_MV
mbmi->ref_mv_idx = 0;
ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
@@ -4877,21 +9146,27 @@
(ref == 0) ? mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv
: mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
clamp_mv_ref(&this_mv.as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
- lower_mv_precision(&this_mv.as_mv, cm->allow_high_precision_mv);
mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
}
}
#endif
- this_rd = handle_inter_mode(cpi, x, bsize, &rate2, &distortion2,
- &skippable, &rate_y, &rate_uv, &disable_skip,
- frame_mv, mi_row, mi_col,
+ this_rd = handle_inter_mode(
+ cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv,
+ &disable_skip, frame_mv, mi_row, mi_col,
#if CONFIG_MOTION_VAR
- dst_buf1, dst_stride1, dst_buf2, dst_stride2,
+ dst_buf1, dst_stride1, dst_buf2, dst_stride2,
#endif // CONFIG_MOTION_VAR
- single_newmv, single_inter_filter,
- single_skippable, &total_sse, best_rd);
+#if CONFIG_EXT_INTER
+ single_newmvs, single_newmvs_rate, &compmode_interintra_cost,
+ &compmode_wedge_cost, modelled_rd,
+#else
+ single_newmv,
+#endif // CONFIG_EXT_INTER
+ single_inter_filter, single_skippable, &total_sse, best_rd);
#if CONFIG_REF_MV
+ // TODO(jingning): This needs some refactoring to improve code quality
+ // and reduce redundant steps.
if ((mbmi->mode == NEARMV &&
mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
(mbmi->mode == NEWMV && mbmi_ext->ref_mv_count[ref_frame_type] > 1)) {
@@ -4900,31 +9175,40 @@
int backup_skip = x->skip;
int64_t tmp_ref_rd = this_rd;
int ref_idx;
+
// TODO(jingning): This should be deprecated shortly.
int idx_offset = (mbmi->mode == NEARMV) ? 1 : 0;
int ref_set =
AOMMIN(2, mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset);
+
uint8_t drl_ctx =
av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx_offset);
-
// Dummy
int_mv backup_fmv[2];
backup_fmv[0] = frame_mv[NEWMV][ref_frame];
if (comp_pred) backup_fmv[1] = frame_mv[NEWMV][second_ref_frame];
- rate2 += cpi->drl_mode_cost[drl_ctx][0];
+ rate2 += (rate2 < INT_MAX ? cpi->drl_mode_cost0[drl_ctx][0] : 0);
if (this_rd < INT64_MAX) {
- if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv + rate_skip0,
- distortion2) <
- RDCOST(x->rdmult, x->rddiv, rate_skip1, total_sse))
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+ RDCOST(x->rdmult, x->rddiv, 0, total_sse))
tmp_ref_rd =
- RDCOST(x->rdmult, x->rddiv, rate2 + rate_skip0, distortion2);
+ RDCOST(x->rdmult, x->rddiv,
+ rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
+ distortion2);
else
tmp_ref_rd =
RDCOST(x->rdmult, x->rddiv,
- rate2 + rate_skip1 - rate_y - rate_uv, total_sse);
+ rate2 + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) -
+ rate_y - rate_uv,
+ total_sse);
}
+#if CONFIG_VAR_TX
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(x->blk_skip_drl[i], x->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif
for (ref_idx = 0; ref_idx < ref_set; ++ref_idx) {
int64_t tmp_alt_rd = INT64_MAX;
@@ -4945,19 +9229,28 @@
: mbmi_ext->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx]
.comp_mv;
clamp_mv_ref(&this_mv.as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
- lower_mv_precision(&this_mv.as_mv, cm->allow_high_precision_mv);
mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
}
cur_mv =
mbmi_ext->ref_mv_stack[ref_frame][mbmi->ref_mv_idx + idx_offset]
.this_mv;
- lower_mv_precision(&cur_mv.as_mv, cm->allow_high_precision_mv);
clamp_mv2(&cur_mv.as_mv, xd);
if (!mv_check_bounds(x, &cur_mv.as_mv)) {
- int dummy_single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
- int_mv dummy_single_newmv[MAX_REF_FRAMES] = { { 0 } };
+ int dummy_single_skippable[MB_MODE_COUNT]
+ [TOTAL_REFS_PER_FRAME] = { { 0 } };
+#if CONFIG_EXT_INTER
+ int_mv dummy_single_newmvs[2][TOTAL_REFS_PER_FRAME] = { { { 0 } },
+ { { 0 } } };
+ int dummy_single_newmvs_rate[2][TOTAL_REFS_PER_FRAME] = { { 0 },
+ { 0 } };
+ int dummy_compmode_interintra_cost = 0;
+ int dummy_compmode_wedge_cost = 0;
+#else
+ int_mv dummy_single_newmv[TOTAL_REFS_PER_FRAME] = { { 0 } };
+#endif
+
frame_mv[NEARMV][ref_frame] = cur_mv;
tmp_alt_rd = handle_inter_mode(
cpi, x, bsize, &tmp_rate, &tmp_dist, &tmp_skip, &tmp_rate_y,
@@ -4965,15 +9258,22 @@
#if CONFIG_MOTION_VAR
dst_buf1, dst_stride1, dst_buf2, dst_stride2,
#endif // CONFIG_MOTION_VAR
- dummy_single_newmv, single_inter_filter, dummy_single_skippable,
- &tmp_sse, best_rd);
+#if CONFIG_EXT_INTER
+ dummy_single_newmvs, dummy_single_newmvs_rate,
+ &dummy_compmode_interintra_cost, &dummy_compmode_wedge_cost,
+ NULL,
+#else
+ dummy_single_newmv,
+#endif
+ single_inter_filter, dummy_single_skippable, &tmp_sse, best_rd);
}
for (i = 0; i < mbmi->ref_mv_idx; ++i) {
uint8_t drl1_ctx = 0;
drl1_ctx = av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
i + idx_offset);
- tmp_rate += cpi->drl_mode_cost[drl1_ctx][1];
+ tmp_rate +=
+ (tmp_rate < INT_MAX ? cpi->drl_mode_cost0[drl1_ctx][1] : 0);
}
if (mbmi_ext->ref_mv_count[ref_frame_type] >
@@ -4982,51 +9282,63 @@
uint8_t drl1_ctx =
av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
mbmi->ref_mv_idx + idx_offset);
- tmp_rate += cpi->drl_mode_cost[drl1_ctx][0];
+ tmp_rate += cpi->drl_mode_cost0[drl1_ctx][0];
}
if (tmp_alt_rd < INT64_MAX) {
#if CONFIG_MOTION_VAR
tmp_alt_rd = RDCOST(x->rdmult, x->rddiv, tmp_rate, tmp_dist);
#else
- if (RDCOST(x->rdmult, x->rddiv,
- tmp_rate_y + tmp_rate_uv + rate_skip0, tmp_dist) <
- RDCOST(x->rdmult, x->rddiv, rate_skip1, tmp_sse))
+ if (RDCOST(x->rdmult, x->rddiv, tmp_rate_y + tmp_rate_uv,
+ tmp_dist) < RDCOST(x->rdmult, x->rddiv, 0, tmp_sse))
tmp_alt_rd =
- RDCOST(x->rdmult, x->rddiv, tmp_rate + rate_skip0, tmp_dist);
+ RDCOST(x->rdmult, x->rddiv,
+ tmp_rate + av1_cost_bit(av1_get_skip_prob(cm, xd), 0),
+ tmp_dist);
else
- tmp_alt_rd = RDCOST(
- x->rdmult, x->rddiv,
- tmp_rate + rate_skip1 - tmp_rate_y - tmp_rate_uv, tmp_sse);
+ tmp_alt_rd =
+ RDCOST(x->rdmult, x->rddiv,
+ tmp_rate + av1_cost_bit(av1_get_skip_prob(cm, xd), 1) -
+ tmp_rate_y - tmp_rate_uv,
+ tmp_sse);
#endif // CONFIG_MOTION_VAR
}
if (tmp_ref_rd > tmp_alt_rd) {
rate2 = tmp_rate;
+ disable_skip = dummy_disable_skip;
distortion2 = tmp_dist;
skippable = tmp_skip;
- disable_skip = dummy_disable_skip;
rate_y = tmp_rate_y;
rate_uv = tmp_rate_uv;
total_sse = tmp_sse;
this_rd = tmp_alt_rd;
- mbmi->ref_mv_idx = 1 + ref_idx;
tmp_ref_rd = tmp_alt_rd;
backup_mbmi = *mbmi;
backup_skip = x->skip;
+#if CONFIG_VAR_TX
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(x->blk_skip_drl[i], x->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif
} else {
*mbmi = backup_mbmi;
x->skip = backup_skip;
}
}
+
frame_mv[NEARMV][ref_frame] = backup_mv;
frame_mv[NEWMV][ref_frame] = backup_fmv[0];
if (comp_pred) frame_mv[NEWMV][second_ref_frame] = backup_fmv[1];
+#if CONFIG_VAR_TX
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(x->blk_skip[i], x->blk_skip_drl[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif
}
-
mbmi_ext->ref_mvs[ref_frame][0] = backup_ref_mv[0];
if (comp_pred) mbmi_ext->ref_mvs[second_ref_frame][0] = backup_ref_mv[1];
-#endif
+#endif // CONFIG_REF_MV
if (this_rd == INT64_MAX) continue;
@@ -5035,10 +9347,22 @@
if (cm->reference_mode == REFERENCE_MODE_SELECT) rate2 += compmode_cost;
}
+#if CONFIG_EXT_INTER
+ rate2 += compmode_interintra_cost;
+ if (cm->reference_mode != SINGLE_REFERENCE && comp_pred)
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ if (mbmi->motion_mode == SIMPLE_TRANSLATION)
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ rate2 += compmode_wedge_cost;
+#endif // CONFIG_EXT_INTER
+
// Estimate the reference frame signaling cost and add it
// to the rolling cost variable.
if (comp_pred) {
rate2 += ref_costs_comp[ref_frame];
+#if CONFIG_EXT_REFS
+ rate2 += ref_costs_comp[second_ref_frame];
+#endif // CONFIG_EXT_REFS
} else {
rate2 += ref_costs_single[ref_frame];
}
@@ -5051,7 +9375,8 @@
if (skippable) {
// Back out the coefficient coding costs
rate2 -= (rate_y + rate_uv);
-
+ rate_y = 0;
+ rate_uv = 0;
// Cost the skip mb case
rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
} else if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) {
@@ -5072,6 +9397,8 @@
assert(total_sse >= 0);
rate2 -= (rate_y + rate_uv);
this_skip2 = 1;
+ rate_y = 0;
+ rate_uv = 0;
}
} else {
// Add in the cost of the no skip flag.
@@ -5084,6 +9411,10 @@
} else {
this_skip2 = mbmi->skip;
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+ if (this_skip2) {
+ rate_y = 0;
+ rate_uv = 0;
+ }
#endif // CONFIG_MOTION_VAR
}
@@ -5093,6 +9424,13 @@
best_intra_rd = this_rd;
best_intra_mode = mbmi->mode;
}
+#if CONFIG_EXT_INTER
+ } else if (second_ref_frame == NONE) {
+ if (this_rd < best_single_inter_rd) {
+ best_single_inter_rd = this_rd;
+ best_single_inter_ref = mbmi->ref_frame[0];
+ }
+#endif // CONFIG_EXT_INTER
}
if (!disable_skip && ref_frame == INTRA_FRAME) {
@@ -5114,12 +9452,35 @@
}
rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+ if (x->skip)
+ *returnrate_nocoef = rate2;
+ else
+ *returnrate_nocoef = rate2 - rate_y - rate_uv;
+ *returnrate_nocoef -= av1_cost_bit(
+ av1_get_skip_prob(cm, xd), disable_skip || skippable || this_skip2);
+ *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd),
+ mbmi->ref_frame[0] != INTRA_FRAME);
+#if CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+ if (is_inter_block(mbmi) && is_motion_variation_allowed(mbmi))
+ *returnrate_nocoef -= cpi->motion_mode_cost[bsize][mbmi->motion_mode];
+#endif // CONFIG_MOTION_VAR || CONFIG_WARPED_MOTION
+#endif // CONFIG_SUPERTX
rd_cost->dist = distortion2;
rd_cost->rdcost = this_rd;
best_rd = this_rd;
best_mbmode = *mbmi;
best_skip2 = this_skip2;
best_mode_skippable = skippable;
+ best_rate_y = rate_y + av1_cost_bit(av1_get_skip_prob(cm, xd),
+ this_skip2 || skippable);
+ best_rate_uv = rate_uv;
+
+#if CONFIG_VAR_TX
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(ctx->blk_skip[i], x->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif
}
}
@@ -5152,13 +9513,126 @@
if (x->skip && !comp_pred) break;
}
+ if (xd->lossless[mbmi->segment_id] == 0 && best_mode_index >= 0 &&
+ ((sf->tx_type_search.fast_inter_tx_type_search == 1 &&
+ is_inter_mode(best_mbmode.mode)) ||
+ (sf->tx_type_search.fast_intra_tx_type_search == 1 &&
+ !is_inter_mode(best_mbmode.mode)))) {
+ int rate_y = 0, rate_uv = 0;
+ int64_t dist_y = 0, dist_uv = 0;
+ int skip_y = 0, skip_uv = 0, skip_blk = 0;
+ int64_t sse_y = 0, sse_uv = 0;
+
+ x->use_default_inter_tx_type = 0;
+ x->use_default_intra_tx_type = 0;
+
+ *mbmi = best_mbmode;
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (has_second_ref(mbmi))
+ xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ if (is_inter_mode(mbmi->mode)) {
+#if CONFIG_VAR_TX
+ RD_STATS rd_stats_uv;
+#endif
+ av1_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+#if CONFIG_MOTION_VAR
+ if (mbmi->motion_mode == OBMC_CAUSAL)
+ av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, dst_buf1,
+ dst_stride1, dst_buf2, dst_stride2);
+#endif // CONFIG_MOTION_VAR
+ av1_subtract_plane(x, bsize, 0);
+#if CONFIG_VAR_TX
+ if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) {
+ RD_STATS rd_stats_y;
+ select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ rate_y = rd_stats_y.rate;
+ dist_y = rd_stats_y.dist;
+ sse_y = rd_stats_y.sse;
+ skip_y = rd_stats_y.skip;
+ } else {
+ int idx, idy;
+ super_block_yrd(cpi, x, &rate_y, &dist_y, &skip_y, &sse_y, bsize,
+ INT64_MAX);
+ for (idy = 0; idy < xd->n8_h; ++idy)
+ for (idx = 0; idx < xd->n8_w; ++idx)
+ mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
+ memset(x->blk_skip[0], skip_y,
+ sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+ }
+
+ inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+ rate_uv = rd_stats_uv.rate;
+ dist_uv = rd_stats_uv.dist;
+ skip_uv = rd_stats_uv.skip;
+ sse_uv = rd_stats_uv.sse;
+#else
+ super_block_yrd(cpi, x, &rate_y, &dist_y, &skip_y, &sse_y, bsize,
+ INT64_MAX);
+ super_block_uvrd(cpi, x, &rate_uv, &dist_uv, &skip_uv, &sse_uv, bsize,
+ INT64_MAX);
+#endif // CONFIG_VAR_TX
+ } else {
+ super_block_yrd(cpi, x, &rate_y, &dist_y, &skip_y, &sse_y, bsize,
+ INT64_MAX);
+ super_block_uvrd(cpi, x, &rate_uv, &dist_uv, &skip_uv, &sse_uv, bsize,
+ INT64_MAX);
+ }
+
+ if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, (dist_y + dist_uv)) >
+ RDCOST(x->rdmult, x->rddiv, 0, (sse_y + sse_uv))) {
+ skip_blk = 1;
+ rate_y = av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
+ rate_uv = 0;
+ dist_y = sse_y;
+ dist_uv = sse_uv;
+ } else {
+ skip_blk = 0;
+ rate_y += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
+ }
+
+ if (RDCOST(x->rdmult, x->rddiv, best_rate_y + best_rate_uv, rd_cost->dist) >
+ RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, (dist_y + dist_uv))) {
+#if CONFIG_VAR_TX
+ int idx, idy;
+#endif
+ best_mbmode.tx_type = mbmi->tx_type;
+ best_mbmode.tx_size = mbmi->tx_size;
+#if CONFIG_VAR_TX
+ for (idy = 0; idy < xd->n8_h; ++idy)
+ for (idx = 0; idx < xd->n8_w; ++idx)
+ best_mbmode.inter_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
+
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memcpy(ctx->blk_skip[i], x->blk_skip[i],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
+
+ best_mbmode.min_tx_size = mbmi->min_tx_size;
+#endif
+ rd_cost->rate += (rate_y + rate_uv - best_rate_y - best_rate_uv);
+ rd_cost->dist = dist_y + dist_uv;
+ rd_cost->rdcost =
+ RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+ best_skip2 = skip_blk;
+ }
+ }
+
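The block above finishes a two-stage transform-type search: during the mode loop, x->use_default_intra_tx_type and x->use_default_inter_tx_type pin each candidate to a cheap default transform, and only the winning mode is re-evaluated here with the full transform-type search (the flags are cleared, the Y and UV rates are recomputed, and rd_cost is adjusted only when the refined estimate beats best_rate_y/best_rate_uv). A minimal sketch of that pattern, assuming hypothetical evaluate_mode() and evaluate_mode_full_tx() callbacks rather than the libaom routines:

/* Sketch only: cheap first pass with a default transform, then a full
 * tx-type refinement of the winner. The callbacks are hypothetical
 * stand-ins, not libaom API. */
#include <limits.h>
#include <stdint.h>

typedef struct { int mode; int64_t rd; } candidate_t;

static candidate_t pick_mode_two_stage(
    int num_modes, int64_t (*evaluate_mode)(int mode, int use_default_tx),
    int64_t (*evaluate_mode_full_tx)(int mode)) {
  candidate_t best = { -1, INT64_MAX };
  for (int m = 0; m < num_modes; ++m) {
    /* First pass: default transform type only (fast but approximate). */
    const int64_t rd = evaluate_mode(m, 1);
    if (rd < best.rd) { best.rd = rd; best.mode = m; }
  }
  if (best.mode >= 0) {
    /* Second pass: rerun only the winner with the full tx-type search
     * and keep the refined cost if it is better. */
    const int64_t refined = evaluate_mode_full_tx(best.mode);
    if (refined < best.rd) best.rd = refined;
  }
  return best;
}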
#if CONFIG_PALETTE
// Only try palette mode when the best mode so far is an intra mode.
if (cm->allow_screen_content_tools && !is_inter_mode(best_mbmode.mode)) {
PREDICTION_MODE mode_selected;
int rate2 = 0, rate_y = 0;
+#if CONFIG_SUPERTX
+ int best_rate_nocoef;
+#endif
int64_t distortion2 = 0, distortion_y = 0, dummy_rd = best_rd, this_rd;
- int skippable = 0, rate_overhead = 0;
+ int skippable = 0, rate_overhead_palette = 0;
TX_SIZE best_tx_size, uv_tx;
TX_TYPE best_tx_type;
PALETTE_MODE_INFO palette_mode_info;
@@ -5171,8 +9645,8 @@
mbmi->ref_frame[0] = INTRA_FRAME;
mbmi->ref_frame[1] = NONE;
palette_mode_info.palette_size[0] = 0;
- rate_overhead = rd_pick_palette_intra_sby(
- cpi, x, bsize, palette_ctx, cpi->mbmode_cost[DC_PRED],
+ rate_overhead_palette = rd_pick_palette_intra_sby(
+ cpi, x, bsize, palette_ctx, intra_mode_cost[DC_PRED],
&palette_mode_info, best_palette_color_map, &best_tx_size,
&best_tx_type, &mode_selected, &dummy_rd);
if (palette_mode_info.palette_size[0] == 0) goto PALETTE_EXIT;
@@ -5187,17 +9661,19 @@
super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize,
best_rd);
if (rate_y == INT_MAX) goto PALETTE_EXIT;
- uv_tx =
- get_uv_tx_size_impl(mbmi->tx_size, bsize, xd->plane[1].subsampling_x,
- xd->plane[1].subsampling_y);
+ uv_tx = uv_txsize_lookup[bsize][mbmi->tx_size][xd->plane[1].subsampling_x]
+ [xd->plane[1].subsampling_y];
if (rate_uv_intra[uv_tx] == INT_MAX) {
- choose_intra_uv_mode(cpi, x, bsize, uv_tx, &rate_uv_intra[uv_tx],
- &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx],
- &skip_uv[uv_tx], &mode_uv[uv_tx]);
+ choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
+ &rate_uv_tokenonly[uv_tx], &dist_uvs[uv_tx],
+ &skip_uvs[uv_tx], &mode_uv[uv_tx]);
pmi_uv[uv_tx] = *pmi;
#if CONFIG_EXT_INTRA
- uv_angle_delta[uv_tx] = mbmi->intra_angle_delta[1];
+ uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
#endif // CONFIG_EXT_INTRA
+#if CONFIG_FILTER_INTRA
+ filter_intra_mode_info_uv[uv_tx] = mbmi->filter_intra_mode_info;
+#endif // CONFIG_FILTER_INTRA
}
mbmi->uv_mode = mode_uv[uv_tx];
pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
@@ -5206,17 +9682,31 @@
pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
#if CONFIG_EXT_INTRA
- mbmi->intra_angle_delta[1] = uv_angle_delta[uv_tx];
+ mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
#endif // CONFIG_EXT_INTRA
- skippable = skippable && skip_uv[uv_tx];
- distortion2 = distortion_y + dist_uv[uv_tx];
- rate2 = rate_y + rate_overhead + rate_uv_intra[uv_tx];
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1];
+ if (filter_intra_mode_info_uv[uv_tx].use_filter_intra_mode[1]) {
+ mbmi->filter_intra_mode_info.filter_intra_mode[1] =
+ filter_intra_mode_info_uv[uv_tx].filter_intra_mode[1];
+ }
+#endif // CONFIG_FILTER_INTRA
+ skippable = skippable && skip_uvs[uv_tx];
+ distortion2 = distortion_y + dist_uvs[uv_tx];
+ rate2 = rate_y + rate_overhead_palette + rate_uv_intra[uv_tx];
rate2 += ref_costs_single[INTRA_FRAME];
if (skippable) {
rate2 -= (rate_y + rate_uv_tokenonly[uv_tx]);
+#if CONFIG_SUPERTX
+ best_rate_nocoef = rate2;
+#endif
rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 1);
} else {
+#if CONFIG_SUPERTX
+ best_rate_nocoef = rate2 - (rate_y + rate_uv_tokenonly[uv_tx]);
+#endif
rate2 += av1_cost_bit(av1_get_skip_prob(cm, xd), 0);
}
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
@@ -5224,6 +9714,9 @@
best_mode_index = 3;
mbmi->mv[0].as_int = 0;
rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = best_rate_nocoef;
+#endif
rd_cost->dist = distortion2;
rd_cost->rdcost = this_rd;
best_rd = this_rd;
@@ -5235,16 +9728,60 @@
PALETTE_EXIT:
#endif // CONFIG_PALETTE
+#if CONFIG_FILTER_INTRA
+ // TODO(huisu): filter-intra is turned off in lossless mode for now to
+ // avoid a unit test failure
+ if (!xd->lossless[mbmi->segment_id] &&
+#if CONFIG_PALETTE
+ mbmi->palette_mode_info.palette_size[0] == 0 &&
+#endif // CONFIG_PALETTE
+ !dc_skipped && best_mode_index >= 0 &&
+ best_intra_rd < (best_rd + (best_rd >> 3))) {
+ pick_filter_intra_interframe(
+ cpi, x, ctx, bsize, rate_uv_intra, rate_uv_tokenonly, dist_uvs,
+ skip_uvs, mode_uv, filter_intra_mode_info_uv,
+#if CONFIG_EXT_INTRA
+ uv_angle_delta,
+#endif // CONFIG_EXT_INTRA
+#if CONFIG_PALETTE
+ pmi_uv, palette_ctx,
+#endif // CONFIG_PALETTE
+ 0, ref_costs_single, &best_rd, &best_intra_rd, &best_intra_mode,
+ &best_mode_index, &best_skip2, &best_mode_skippable,
+#if CONFIG_SUPERTX
+ returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ best_pred_rd, &best_mbmode, rd_cost);
+ }
+#endif // CONFIG_FILTER_INTRA
+
// The inter modes' rate costs are not calculated precisely in some cases.
// Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and
// ZEROMV. Here, checks are added for those cases, and the mode decisions
// are corrected.
- if (best_mbmode.mode == NEWMV) {
+ if (best_mbmode.mode == NEWMV
+#if CONFIG_EXT_INTER
+ || best_mbmode.mode == NEWFROMNEARMV || best_mbmode.mode == NEW_NEWMV
+#endif // CONFIG_EXT_INTER
+ ) {
const MV_REFERENCE_FRAME refs[2] = { best_mbmode.ref_frame[0],
best_mbmode.ref_frame[1] };
int comp_pred_mode = refs[1] > INTRA_FRAME;
+ int_mv zeromv[2];
#if CONFIG_REF_MV
const uint8_t rf_type = av1_ref_frame_type(best_mbmode.ref_frame);
+#endif // CONFIG_REF_MV
+#if CONFIG_GLOBAL_MOTION
+ zeromv[0].as_int = gm_get_motion_vector(&cm->global_motion[refs[0]]).as_int;
+ zeromv[1].as_int =
+ comp_pred_mode
+ ? gm_get_motion_vector(&cm->global_motion[refs[1]]).as_int
+ : 0;
+#else
+ zeromv[0].as_int = 0;
+ zeromv[1].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
+#if CONFIG_REF_MV
if (!comp_pred_mode) {
int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2)
? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2)
@@ -5252,7 +9789,6 @@
for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
int_mv cur_mv = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
- lower_mv_precision(&cur_mv.as_mv, cm->allow_high_precision_mv);
if (cur_mv.as_int == best_mbmode.mv[0].as_int) {
best_mbmode.mode = NEARMV;
best_mbmode.ref_mv_idx = i;
@@ -5261,16 +9797,21 @@
if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int)
best_mbmode.mode = NEARESTMV;
- else if (best_mbmode.mv[0].as_int == 0)
+ else if (best_mbmode.mv[0].as_int == zeromv[0].as_int)
best_mbmode.mode = ZEROMV;
} else {
- const int allow_hp = cm->allow_high_precision_mv;
- int_mv nearestmv[2] = { frame_mv[NEARESTMV][refs[0]],
- frame_mv[NEARESTMV][refs[1]] };
+ int_mv nearestmv[2];
+ int_mv nearmv[2];
- int_mv nearmv[2] = { frame_mv[NEARMV][refs[0]],
- frame_mv[NEARMV][refs[1]] };
-
+#if CONFIG_EXT_INTER
+ if (mbmi_ext->ref_mv_count[rf_type] > 1) {
+ nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][1].this_mv;
+ nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][1].comp_mv;
+ } else {
+ nearmv[0] = frame_mv[NEARMV][refs[0]];
+ nearmv[1] = frame_mv[NEARMV][refs[1]];
+ }
+#else
int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2)
? AOMMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2)
: INT_MAX;
@@ -5278,8 +9819,6 @@
for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv;
- lower_mv_precision(&nearmv[0].as_mv, allow_hp);
- lower_mv_precision(&nearmv[1].as_mv, allow_hp);
if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
nearmv[1].as_int == best_mbmode.mv[1].as_int) {
@@ -5287,46 +9826,100 @@
best_mbmode.ref_mv_idx = i;
}
}
-
+#endif
if (mbmi_ext->ref_mv_count[rf_type] >= 1) {
nearestmv[0] = mbmi_ext->ref_mv_stack[rf_type][0].this_mv;
nearestmv[1] = mbmi_ext->ref_mv_stack[rf_type][0].comp_mv;
- }
-
- for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
- lower_mv_precision(&nearestmv[i].as_mv, allow_hp);
- lower_mv_precision(&nearmv[i].as_mv, allow_hp);
+ } else {
+ nearestmv[0] = frame_mv[NEARESTMV][refs[0]];
+ nearestmv[1] = frame_mv[NEARESTMV][refs[1]];
}
if (nearestmv[0].as_int == best_mbmode.mv[0].as_int &&
nearestmv[1].as_int == best_mbmode.mv[1].as_int)
- best_mbmode.mode = NEARESTMV;
+#if CONFIG_EXT_INTER
+ best_mbmode.mode = NEAREST_NEARESTMV;
+ else if (nearestmv[0].as_int == best_mbmode.mv[0].as_int &&
+ nearmv[1].as_int == best_mbmode.mv[1].as_int)
+ best_mbmode.mode = NEAREST_NEARMV;
+ else if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
+ nearestmv[1].as_int == best_mbmode.mv[1].as_int)
+ best_mbmode.mode = NEAR_NEARESTMV;
+ else if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
+ nearmv[1].as_int == best_mbmode.mv[1].as_int)
+ best_mbmode.mode = NEAR_NEARMV;
else if (best_mbmode.mv[0].as_int == 0 && best_mbmode.mv[1].as_int == 0)
+ best_mbmode.mode = ZERO_ZEROMV;
+#else
+ best_mbmode.mode = NEARESTMV;
+ else if (best_mbmode.mv[0].as_int == zeromv[0].as_int &&
+ best_mbmode.mv[1].as_int == zeromv[1].as_int)
best_mbmode.mode = ZEROMV;
+#endif // CONFIG_EXT_INTER
}
#else
- if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
- ((comp_pred_mode &&
- frame_mv[NEARESTMV][refs[1]].as_int == best_mbmode.mv[1].as_int) ||
- !comp_pred_mode))
- best_mbmode.mode = NEARESTMV;
- else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
- ((comp_pred_mode &&
- frame_mv[NEARMV][refs[1]].as_int == best_mbmode.mv[1].as_int) ||
- !comp_pred_mode))
- best_mbmode.mode = NEARMV;
- else if (best_mbmode.mv[0].as_int == 0 &&
- ((comp_pred_mode && best_mbmode.mv[1].as_int == 0) ||
- !comp_pred_mode))
- best_mbmode.mode = ZEROMV;
+#if CONFIG_EXT_INTER
+ if (!comp_pred_mode) {
+#endif // CONFIG_EXT_INTER
+ if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+ ((comp_pred_mode &&
+ frame_mv[NEARESTMV][refs[1]].as_int == best_mbmode.mv[1].as_int) ||
+ !comp_pred_mode))
+ best_mbmode.mode = NEARESTMV;
+ else if (frame_mv[NEARMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
+ ((comp_pred_mode &&
+ frame_mv[NEARMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int) ||
+ !comp_pred_mode))
+ best_mbmode.mode = NEARMV;
+ else if (best_mbmode.mv[0].as_int == zeromv[0].as_int &&
+ ((comp_pred_mode &&
+ best_mbmode.mv[1].as_int == zeromv[1].as_int) ||
+ !comp_pred_mode))
+ best_mbmode.mode = ZEROMV;
+#if CONFIG_EXT_INTER
+ } else {
+ if (frame_mv[NEAREST_NEARESTMV][refs[0]].as_int ==
+ best_mbmode.mv[0].as_int &&
+ frame_mv[NEAREST_NEARESTMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int)
+ best_mbmode.mode = NEAREST_NEARESTMV;
+ else if (frame_mv[NEAREST_NEARMV][refs[0]].as_int ==
+ best_mbmode.mv[0].as_int &&
+ frame_mv[NEAREST_NEARMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int)
+ best_mbmode.mode = NEAREST_NEARMV;
+ else if (frame_mv[NEAR_NEARESTMV][refs[0]].as_int ==
+ best_mbmode.mv[0].as_int &&
+ frame_mv[NEAR_NEARESTMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int)
+ best_mbmode.mode = NEAR_NEARESTMV;
+ else if (frame_mv[NEAR_NEARMV][refs[0]].as_int ==
+ best_mbmode.mv[0].as_int &&
+ frame_mv[NEAR_NEARMV][refs[1]].as_int ==
+ best_mbmode.mv[1].as_int)
+ best_mbmode.mode = NEAR_NEARMV;
+ else if (best_mbmode.mv[0].as_int == 0 && best_mbmode.mv[1].as_int == 0)
+ best_mbmode.mode = ZERO_ZEROMV;
+ }
+#endif // CONFIG_EXT_INTER
#endif
}
#if CONFIG_REF_MV
if (best_mbmode.ref_frame[0] > INTRA_FRAME && best_mbmode.mv[0].as_int == 0 &&
- (best_mbmode.ref_frame[1] == NONE || best_mbmode.mv[1].as_int == 0)) {
- int8_t ref_frame_type = av1_ref_frame_type(best_mbmode.ref_frame);
- int16_t mode_ctx = mbmi_ext->mode_context[ref_frame_type];
+#if CONFIG_EXT_INTER
+ (best_mbmode.ref_frame[1] <= INTRA_FRAME)
+#else
+ (best_mbmode.ref_frame[1] == NONE || best_mbmode.mv[1].as_int == 0)
+#endif // CONFIG_EXT_INTER
+ ) {
+ int16_t mode_ctx = mbmi_ext->mode_context[best_mbmode.ref_frame[0]];
+#if !CONFIG_EXT_INTER
+ if (best_mbmode.ref_frame[1] > NONE)
+ mode_ctx &= (mbmi_ext->mode_context[best_mbmode.ref_frame[1]] | 0x00ff);
+#endif // !CONFIG_EXT_INTER
+
if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) best_mbmode.mode = ZEROMV;
}
#endif
@@ -5337,12 +9930,29 @@
return;
}
+#if CONFIG_DUAL_FILTER
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter[0]) ||
+ !is_inter_block(&best_mbmode));
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter[1]) ||
+ !is_inter_block(&best_mbmode));
+ if (best_mbmode.ref_frame[1] > INTRA_FRAME) {
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter[2]) ||
+ !is_inter_block(&best_mbmode));
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter[3]) ||
+ !is_inter_block(&best_mbmode));
+ }
+#else
assert((cm->interp_filter == SWITCHABLE) ||
(cm->interp_filter == best_mbmode.interp_filter) ||
!is_inter_block(&best_mbmode));
+#endif
if (!cpi->rc.is_src_frame_alt_ref)
- av1_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+ av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
sf->adaptive_rd_thresh, bsize, best_mode_index);
// macroblock modes
@@ -5371,6 +9981,7 @@
store_coding_context(x, ctx, best_mode_index, best_pred_diff,
best_mode_skippable);
+
#if CONFIG_PALETTE
if (cm->allow_screen_content_tools && pmi->palette_size[1] > 0) {
restore_uv_color_map(cpi, x);
@@ -5390,7 +10001,8 @@
const int comp_pred = 0;
int i;
int64_t best_pred_diff[REFERENCE_MODES];
- unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+ unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME];
+ unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME];
aom_prob comp_mode_p;
InterpFilter best_filter = SWITCHABLE;
int64_t this_rd = INT64_MAX;
@@ -5399,27 +10011,36 @@
estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
&comp_mode_p);
-#if CONFIG_PALETTE
- mbmi->palette_mode_info.palette_size[0] = 0;
- mbmi->palette_mode_info.palette_size[1] = 0;
-#endif // CONFIG_PALETTE
- for (i = 0; i < MAX_REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
- for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i) x->pred_mv_sad[i] = INT_MAX;
+ for (i = 0; i < TOTAL_REFS_PER_FRAME; ++i) x->pred_sse[i] = INT_MAX;
+ for (i = LAST_FRAME; i < TOTAL_REFS_PER_FRAME; ++i)
+ x->pred_mv_sad[i] = INT_MAX;
rd_cost->rate = INT_MAX;
assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
+#if CONFIG_PALETTE
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+#endif // CONFIG_PALETTE
+
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
mbmi->mode = ZEROMV;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
mbmi->uv_mode = DC_PRED;
mbmi->ref_frame[0] = LAST_FRAME;
mbmi->ref_frame[1] = NONE;
+#if CONFIG_GLOBAL_MOTION
+ mbmi->mv[0].as_int =
+ gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]]).as_int;
+#else // CONFIG_GLOBAL_MOTION
mbmi->mv[0].as_int = 0;
+#endif // CONFIG_GLOBAL_MOTION
mbmi->tx_size = max_txsize_lookup[bsize];
-#if CONFIG_MOTION_VAR
- mbmi->motion_mode = SIMPLE_TRANSLATION;
-#endif // CONFIG_MOTION_VAR
x->skip = 1;
#if CONFIG_REF_MV
@@ -5428,31 +10049,47 @@
#endif
if (cm->interp_filter != BILINEAR) {
- best_filter = EIGHTTAP;
- if (cm->interp_filter == SWITCHABLE) {
+ best_filter = EIGHTTAP_REGULAR;
+ if (cm->interp_filter == SWITCHABLE &&
#if CONFIG_EXT_INTERP
- if (is_interp_needed(xd))
+ av1_is_interp_needed(xd) &&
+#endif // CONFIG_EXT_INTERP
+ x->source_variance >= cpi->sf.disable_filter_search_var_thresh) {
+ int rs;
+ int best_rs = INT_MAX;
+ for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+#if CONFIG_DUAL_FILTER
+ int k;
+ for (k = 0; k < 4; ++k) mbmi->interp_filter[k] = i;
+#else
+ mbmi->interp_filter = i;
#endif
- {
- int rs;
- int best_rs = INT_MAX;
- for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
- mbmi->interp_filter = i;
- rs = av1_get_switchable_rate(cpi, xd);
- if (rs < best_rs) {
- best_rs = rs;
- best_filter = mbmi->interp_filter;
- }
+ rs = av1_get_switchable_rate(cpi, xd);
+ if (rs < best_rs) {
+ best_rs = rs;
+#if CONFIG_DUAL_FILTER
+ best_filter = mbmi->interp_filter[0];
+#else
+ best_filter = mbmi->interp_filter;
+#endif
}
}
}
}
// Set the appropriate filter
if (cm->interp_filter == SWITCHABLE) {
+#if CONFIG_DUAL_FILTER
+ for (i = 0; i < 4; ++i) mbmi->interp_filter[i] = best_filter;
+#else
mbmi->interp_filter = best_filter;
+#endif
rate2 += av1_get_switchable_rate(cpi, xd);
} else {
+#if CONFIG_DUAL_FILTER
+ for (i = 0; i < 4; ++i) mbmi->interp_filter[i] = cm->interp_filter;
+#else
mbmi->interp_filter = cm->interp_filter;
+#endif
}
if (cm->reference_mode == REFERENCE_MODE_SELECT)
@@ -5473,10 +10110,15 @@
return;
}
+#if CONFIG_DUAL_FILTER
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == mbmi->interp_filter[0]));
+#else
assert((cm->interp_filter == SWITCHABLE) ||
(cm->interp_filter == mbmi->interp_filter));
+#endif
- av1_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+ av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
av1_zero(best_pred_diff);
@@ -5484,10 +10126,14 @@
store_coding_context(x, ctx, THR_ZEROMV, best_pred_diff, 0);
}
-void av1_rd_pick_inter_mode_sub8x8(const AV1_COMP *cpi, TileDataEnc *tile_data,
- MACROBLOCK *x, int mi_row, int mi_col,
- RD_COST *rd_cost, BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx,
+void av1_rd_pick_inter_mode_sub8x8(const struct AV1_COMP *cpi,
+ TileDataEnc *tile_data, struct macroblock *x,
+ int mi_row, int mi_col,
+ struct RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+ int *returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far) {
const AV1_COMMON *const cm = &cpi->common;
const RD_OPT *const rd_opt = &cpi->rd;
@@ -5498,9 +10144,9 @@
MV_REFERENCE_FRAME ref_frame, second_ref_frame;
unsigned char segment_id = mbmi->segment_id;
int comp_pred, i;
- int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
- struct buf_2d yv12_mb[MAX_REF_FRAMES][MAX_MB_PLANE];
- static const int flag_list[REFS_PER_FRAME + 1] = {
+ int_mv frame_mv[MB_MODE_COUNT][TOTAL_REFS_PER_FRAME];
+ struct buf_2d yv12_mb[TOTAL_REFS_PER_FRAME][MAX_MB_PLANE];
+ static const int flag_list[TOTAL_REFS_PER_FRAME] = {
0,
AOM_LAST_FLAG,
#if CONFIG_EXT_REFS
@@ -5517,19 +10163,27 @@
int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise
int64_t best_pred_diff[REFERENCE_MODES];
int64_t best_pred_rd[REFERENCE_MODES];
- int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
MB_MODE_INFO best_mbmode;
int ref_index, best_ref_index = 0;
- unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
+ unsigned int ref_costs_single[TOTAL_REFS_PER_FRAME];
+ unsigned int ref_costs_comp[TOTAL_REFS_PER_FRAME];
aom_prob comp_mode_p;
+#if CONFIG_DUAL_FILTER
+ InterpFilter tmp_best_filter[4] = { 0 };
+#else
InterpFilter tmp_best_filter = SWITCHABLE;
+#endif
int rate_uv_intra, rate_uv_tokenonly = INT_MAX;
int64_t dist_uv = INT64_MAX;
int skip_uv;
PREDICTION_MODE mode_uv = DC_PRED;
const int intra_cost_penalty = av1_get_intra_cost_penalty(
cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
- int_mv seg_mvs[4][MAX_REF_FRAMES];
+#if CONFIG_EXT_INTER
+ int_mv seg_mvs[4][2][TOTAL_REFS_PER_FRAME];
+#else
+ int_mv seg_mvs[4][TOTAL_REFS_PER_FRAME];
+#endif // CONFIG_EXT_INTER
b_mode_info best_bmodes[4];
int best_skip2 = 0;
int ref_frame_skip_mask[2] = { 0 };
@@ -5541,25 +10195,53 @@
od_encode_checkpoint(&x->daala_enc, &pre_buf);
#endif
+#if CONFIG_SUPERTX
+ best_rd_so_far = INT64_MAX;
+ best_rd = best_rd_so_far;
+ best_yrd = best_rd_so_far;
+#endif // CONFIG_SUPERTX
av1_zero(best_mbmode);
+#if CONFIG_FILTER_INTRA
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[0] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra_mode[1] = 0;
+#endif // CONFIG_FILTER_INTRA
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+#if CONFIG_EXT_INTER
+ mbmi->use_wedge_interinter = 0;
+ mbmi->use_wedge_interintra = 0;
+#endif // CONFIG_EXT_INTER
+
for (i = 0; i < 4; i++) {
int j;
- for (j = 0; j < MAX_REF_FRAMES; j++) seg_mvs[i][j].as_int = INVALID_MV;
+#if CONFIG_EXT_INTER
+ int k;
+
+ for (k = 0; k < 2; k++)
+ for (j = 0; j < TOTAL_REFS_PER_FRAME; j++)
+ seg_mvs[i][k][j].as_int = INVALID_MV;
+#else
+ for (j = 0; j < TOTAL_REFS_PER_FRAME; j++)
+ seg_mvs[i][j].as_int = INVALID_MV;
+#endif // CONFIG_EXT_INTER
}
estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
&comp_mode_p);
for (i = 0; i < REFERENCE_MODES; ++i) best_pred_rd[i] = INT64_MAX;
- for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
- best_filter_rd[i] = INT64_MAX;
rate_uv_intra = INT_MAX;
rd_cost->rate = INT_MAX;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = INT_MAX;
+#endif
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
x->mbmi_ext->mode_context[ref_frame] = 0;
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+ x->mbmi_ext->compound_mode_context[ref_frame] = 0;
+#endif // CONFIG_REF_MV && CONFIG_EXT_INTER
if (cpi->ref_frame_flags & flag_list[ref_frame]) {
setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
@@ -5568,6 +10250,9 @@
ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
}
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+#if CONFIG_EXT_INTER
+ frame_mv[NEWFROMNEARMV][ref_frame].as_int = INVALID_MV;
+#endif // CONFIG_EXT_INTER
frame_mv[ZEROMV][ref_frame].as_int = 0;
}
@@ -5656,7 +10341,9 @@
#endif // CONFIG_EXT_REFS
break;
case NONE:
- case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break;
+ case TOTAL_REFS_PER_FRAME:
+ assert(0 && "Invalid Reference frame");
+ break;
}
}
}
@@ -5721,13 +10408,17 @@
mbmi->uv_mode = DC_PRED;
mbmi->ref_frame[0] = ref_frame;
mbmi->ref_frame[1] = second_ref_frame;
- // Evaluate all sub-pel filters irrespective of whether we can use
- // them for this frame.
+// Evaluate all sub-pel filters irrespective of whether we can use
+// them for this frame.
+#if CONFIG_DUAL_FILTER
+ for (i = 0; i < 4; ++i)
+ mbmi->interp_filter[i] = cm->interp_filter == SWITCHABLE
+ ? EIGHTTAP_REGULAR
+ : cm->interp_filter;
+#else
mbmi->interp_filter =
- cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter;
-#if CONFIG_MOTION_VAR
- mbmi->motion_mode = SIMPLE_TRANSLATION;
-#endif // CONFIG_MOTION_VAR
+ cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR : cm->interp_filter;
+#endif
x->skip = 0;
set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
@@ -5737,20 +10428,22 @@
if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
}
+#if CONFIG_VAR_TX
+ mbmi->inter_tx_size[0][0] = mbmi->tx_size;
+ mbmi->min_tx_size = get_min_tx_size(mbmi->tx_size);
+#endif
+
if (ref_frame == INTRA_FRAME) {
int rate;
-#if CONFIG_EXT_INTRA
- mbmi->intra_angle_delta[0] = 0;
-#endif // CONFIG_EXT_INTRA
if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y, &distortion_y,
- best_rd) >= best_rd)
+ NULL, best_rd) >= best_rd)
continue;
rate2 += rate;
rate2 += intra_cost_penalty;
distortion2 += distortion_y;
if (rate_uv_intra == INT_MAX) {
- choose_intra_uv_mode(cpi, x, bsize, TX_4X4, &rate_uv_intra,
+ choose_intra_uv_mode(cpi, x, ctx, bsize, TX_4X4, &rate_uv_intra,
&rate_uv_tokenonly, &dist_uv, &skip_uv, &mode_uv);
}
rate2 += rate_uv_intra;
@@ -5769,11 +10462,26 @@
int switchable_filter_index;
int_mv *second_ref =
comp_pred ? &x->mbmi_ext->ref_mvs[second_ref_frame][0] : NULL;
- b_mode_info tmp_best_bmodes[16];
+ b_mode_info tmp_best_bmodes[16]; // Should this be 4 ?
MB_MODE_INFO tmp_best_mbmode;
+#if CONFIG_DUAL_FILTER
+#if CONFIG_EXT_INTERP
+ BEST_SEG_INFO bsi[25];
+#else
+ BEST_SEG_INFO bsi[9];
+#endif
+#else
BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
+#endif
int pred_exists = 0;
int uv_skippable;
+#if CONFIG_EXT_INTER
+ int_mv compound_seg_newmvs[4][2];
+ for (i = 0; i < 4; i++) {
+ compound_seg_newmvs[i][0].as_int = INVALID_MV;
+ compound_seg_newmvs[i][1].as_int = INVALID_MV;
+ }
+#endif // CONFIG_EXT_INTER
this_rd_thresh = (ref_frame == LAST_FRAME)
? rd_opt->threshes[segment_id][bsize][THR_LAST]
@@ -5785,53 +10493,120 @@
this_rd_thresh = (ref_frame == LAST3_FRAME)
? rd_opt->threshes[segment_id][bsize][THR_LAST3]
: this_rd_thresh;
+ this_rd_thresh = (ref_frame == BWDREF_FRAME)
+ ? rd_opt->threshes[segment_id][bsize][THR_BWDR]
+ : this_rd_thresh;
#endif // CONFIG_EXT_REFS
this_rd_thresh = (ref_frame == GOLDEN_FRAME)
? rd_opt->threshes[segment_id][bsize][THR_GOLD]
: this_rd_thresh;
-#if CONFIG_EXT_REFS
-// TODO(zoeliu): To explore whether this_rd_thresh should consider
-// BWDREF_FRAME and ALTREF_FRAME
-#endif // CONFIG_EXT_REFS
+
+ // TODO(any): Add search of the tx_type to improve rd performance at the
+ // expense of speed.
+ mbmi->tx_type = DCT_DCT;
if (cm->interp_filter != BILINEAR) {
- tmp_best_filter = EIGHTTAP;
+#if CONFIG_DUAL_FILTER
+ tmp_best_filter[0] = EIGHTTAP_REGULAR;
+ tmp_best_filter[1] = EIGHTTAP_REGULAR;
+ tmp_best_filter[2] = EIGHTTAP_REGULAR;
+ tmp_best_filter[3] = EIGHTTAP_REGULAR;
+#else
+ tmp_best_filter = EIGHTTAP_REGULAR;
+#endif
if (x->source_variance < sf->disable_filter_search_var_thresh) {
- tmp_best_filter = EIGHTTAP;
+#if CONFIG_DUAL_FILTER
+ tmp_best_filter[0] = EIGHTTAP_REGULAR;
+#else
+ tmp_best_filter = EIGHTTAP_REGULAR;
+#endif
} else if (sf->adaptive_pred_interp_filter == 1 &&
ctx->pred_interp_filter < SWITCHABLE) {
+#if CONFIG_DUAL_FILTER
+ tmp_best_filter[0] = ctx->pred_interp_filter;
+#else
tmp_best_filter = ctx->pred_interp_filter;
+#endif
} else if (sf->adaptive_pred_interp_filter == 2) {
+#if CONFIG_DUAL_FILTER
+ tmp_best_filter[0] = ctx->pred_interp_filter < SWITCHABLE
+ ? ctx->pred_interp_filter
+ : 0;
+#else
tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE
? ctx->pred_interp_filter
: 0;
+#endif
} else {
+#if CONFIG_DUAL_FILTER
+ for (switchable_filter_index = 0;
+#if CONFIG_EXT_INTERP
+ switchable_filter_index < 25;
+#else
+ switchable_filter_index < 9;
+#endif
+ ++switchable_filter_index) {
+#else
for (switchable_filter_index = 0;
switchable_filter_index < SWITCHABLE_FILTERS;
++switchable_filter_index) {
+#endif
int newbest, rs;
int64_t rs_rd;
MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] = filter_sets[switchable_filter_index][0];
+ mbmi->interp_filter[1] = filter_sets[switchable_filter_index][1];
+ mbmi->interp_filter[2] = filter_sets[switchable_filter_index][0];
+ mbmi->interp_filter[3] = filter_sets[switchable_filter_index][1];
+#else
mbmi->interp_filter = switchable_filter_index;
+#endif
tmp_rd = rd_pick_best_sub8x8_mode(
cpi, x, &mbmi_ext->ref_mvs[ref_frame][0], second_ref, best_yrd,
&rate, &rate_y, &distortion, &skippable, &total_sse,
- (int)this_rd_thresh, seg_mvs, bsi, switchable_filter_index,
- mi_row, mi_col);
-
+ (int)this_rd_thresh, seg_mvs,
+#if CONFIG_EXT_INTER
+ compound_seg_newmvs,
+#endif // CONFIG_EXT_INTER
+ bsi, switchable_filter_index, mi_row, mi_col);
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+ if (!av1_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
+ (mbmi->interp_filter[0] != EIGHTTAP_REGULAR ||
+ mbmi->interp_filter[1] != EIGHTTAP_REGULAR)) // invalid config
+ continue;
+#else
+ if (!av1_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
+ mbmi->interp_filter != EIGHTTAP_REGULAR) // invalid config
+ continue;
+#endif
+#endif // CONFIG_EXT_INTERP
if (tmp_rd == INT64_MAX) continue;
rs = av1_get_switchable_rate(cpi, xd);
rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
- tmp_rd += rs_rd;
+ if (cm->interp_filter == SWITCHABLE) tmp_rd += rs_rd;
newbest = (tmp_rd < tmp_best_rd);
if (newbest) {
+#if CONFIG_DUAL_FILTER
+ tmp_best_filter[0] = mbmi->interp_filter[0];
+ tmp_best_filter[1] = mbmi->interp_filter[1];
+ tmp_best_filter[2] = mbmi->interp_filter[2];
+ tmp_best_filter[3] = mbmi->interp_filter[3];
+#else
tmp_best_filter = mbmi->interp_filter;
+#endif
tmp_best_rd = tmp_rd;
}
if ((newbest && cm->interp_filter == SWITCHABLE) ||
- (mbmi->interp_filter == cm->interp_filter &&
- cm->interp_filter != SWITCHABLE)) {
+ (
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] == cm->interp_filter
+#else
+ mbmi->interp_filter == cm->interp_filter
+#endif
+ && cm->interp_filter != SWITCHABLE)) {
tmp_best_rdu = tmp_rd;
tmp_best_rate = rate;
tmp_best_ratey = rate_y;
@@ -5843,16 +10618,6 @@
tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
}
pred_exists = 1;
- if (switchable_filter_index == 0 && sf->use_rd_breakout &&
- best_rd < INT64_MAX) {
- if (tmp_best_rdu / 2 > best_rd) {
- // skip searching the other filters if the first is
- // already substantially larger than the best so far
- tmp_best_filter = mbmi->interp_filter;
- tmp_best_rdu = INT64_MAX;
- break;
- }
- }
}
} // switchable_filter_index loop
}
@@ -5860,16 +10625,50 @@
if (tmp_best_rdu == INT64_MAX && pred_exists) continue;
+#if CONFIG_DUAL_FILTER
+ mbmi->interp_filter[0] =
+ (cm->interp_filter == SWITCHABLE ? tmp_best_filter[0]
+ : cm->interp_filter);
+ mbmi->interp_filter[1] =
+ (cm->interp_filter == SWITCHABLE ? tmp_best_filter[1]
+ : cm->interp_filter);
+ mbmi->interp_filter[2] =
+ (cm->interp_filter == SWITCHABLE ? tmp_best_filter[2]
+ : cm->interp_filter);
+ mbmi->interp_filter[3] =
+ (cm->interp_filter == SWITCHABLE ? tmp_best_filter[3]
+ : cm->interp_filter);
+#else
mbmi->interp_filter =
(cm->interp_filter == SWITCHABLE ? tmp_best_filter
: cm->interp_filter);
+#endif
+
if (!pred_exists) {
// Handles the special case when a filter that is not in the
- // switchable list (bilinear, 6-tap) is indicated at the frame level
+ // switchable list (bilinear) is indicated at the frame level
tmp_rd = rd_pick_best_sub8x8_mode(
cpi, x, &x->mbmi_ext->ref_mvs[ref_frame][0], second_ref, best_yrd,
&rate, &rate_y, &distortion, &skippable, &total_sse,
- (int)this_rd_thresh, seg_mvs, bsi, 0, mi_row, mi_col);
+ (int)this_rd_thresh, seg_mvs,
+#if CONFIG_EXT_INTER
+ compound_seg_newmvs,
+#endif // CONFIG_EXT_INTER
+ bsi, 0, mi_row, mi_col);
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+ if (!av1_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
+ (mbmi->interp_filter[0] != EIGHTTAP_REGULAR ||
+ mbmi->interp_filter[1] != EIGHTTAP_REGULAR)) {
+ mbmi->interp_filter[0] = EIGHTTAP_REGULAR;
+ mbmi->interp_filter[1] = EIGHTTAP_REGULAR;
+ }
+#else
+ if (!av1_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
+ mbmi->interp_filter != EIGHTTAP_REGULAR)
+ mbmi->interp_filter = EIGHTTAP_REGULAR;
+#endif // CONFIG_DUAL_FILTER
+#endif // CONFIG_EXT_INTERP
if (tmp_rd == INT64_MAX) continue;
} else {
total_sse = tmp_best_sse;
@@ -5880,16 +10679,29 @@
*mbmi = tmp_best_mbmode;
for (i = 0; i < 4; i++) xd->mi[0]->bmi[i] = tmp_best_bmodes[i];
}
-
-#if CONFIG_EXT_INTERP
- if (cm->interp_filter == SWITCHABLE && !is_interp_needed(xd))
- mbmi->interp_filter = EIGHTTAP;
+ // Add in the cost of the transform type
+ if (!xd->lossless[mbmi->segment_id]) {
+ int rate_tx_type = 0;
+#if CONFIG_EXT_TX
+ if (get_ext_tx_types(mbmi->tx_size, bsize, 1) > 1) {
+ const int eset = get_ext_tx_set(mbmi->tx_size, bsize, 1);
+ rate_tx_type =
+ cpi->inter_tx_type_costs[eset][mbmi->tx_size][mbmi->tx_type];
+ }
+#else
+ if (mbmi->tx_size < TX_32X32) {
+ rate_tx_type = cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
+ }
#endif
+ rate += rate_tx_type;
+ rate_y += rate_tx_type;
+ }
rate2 += rate;
distortion2 += distortion;
- rate2 += av1_get_switchable_rate(cpi, xd);
+ if (cm->interp_filter == SWITCHABLE)
+ rate2 += av1_get_switchable_rate(cpi, xd);
if (!mode_excluded)
mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
@@ -5904,15 +10716,30 @@
if (tmp_best_rdu > 0) {
// If even the 'Y' rd value of split is higher than best so far
// then don't bother looking at UV
+ int is_cost_valid_uv;
+#if CONFIG_VAR_TX
+ RD_STATS rd_stats_uv;
+#endif
av1_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8);
- if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
- &uv_sse, BLOCK_8X8, tmp_best_rdu))
- continue;
-
+#if CONFIG_VAR_TX
+ is_cost_valid_uv =
+ inter_block_uvrd(cpi, x, &rd_stats_uv, BLOCK_8X8, tmp_best_rdu);
+ rate_uv = rd_stats_uv.rate;
+ distortion_uv = rd_stats_uv.dist;
+ uv_skippable = rd_stats_uv.skip;
+ uv_sse = rd_stats_uv.sse;
+#else
+ is_cost_valid_uv =
+ super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
+ &uv_sse, BLOCK_8X8, tmp_best_rdu);
+#endif
+ if (!is_cost_valid_uv) continue;
rate2 += rate_uv;
distortion2 += distortion_uv;
skippable = skippable && uv_skippable;
total_sse += uv_sse;
+ } else {
+ continue;
}
}
@@ -5922,6 +10749,9 @@
// to the rolling cost variable.
if (second_ref_frame > INTRA_FRAME) {
rate2 += ref_costs_comp[ref_frame];
+#if CONFIG_EXT_REFS
+ rate2 += ref_costs_comp[second_ref_frame];
+#endif // CONFIG_EXT_REFS
} else {
rate2 += ref_costs_single[ref_frame];
}
@@ -5957,8 +10787,6 @@
if (!disable_skip && ref_frame == INTRA_FRAME) {
for (i = 0; i < REFERENCE_MODES; ++i)
best_pred_rd[i] = AOMMIN(best_pred_rd[i], this_rd);
- for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
- best_filter_rd[i] = AOMMIN(best_filter_rd[i], this_rd);
}
// Did this mode help.. i.e. is it the new best mode
@@ -5973,6 +10801,15 @@
}
rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = rate2 - rate_y - rate_uv;
+ if (!disable_skip)
+ *returnrate_nocoef -=
+ av1_cost_bit(av1_get_skip_prob(cm, xd), this_skip2);
+ *returnrate_nocoef -= av1_cost_bit(av1_get_intra_inter_prob(cm, xd),
+ mbmi->ref_frame[0] != INTRA_FRAME);
+ assert(*returnrate_nocoef > 0);
+#endif // CONFIG_SUPERTX
rd_cost->dist = distortion2;
rd_cost->rdcost = this_rd;
best_rd = this_rd;
@@ -5981,6 +10818,11 @@
best_mbmode = *mbmi;
best_skip2 = this_skip2;
+#if CONFIG_VAR_TX
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ memset(ctx->blk_skip[i], 0, sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif
+
for (i = 0; i < 4; i++) best_bmodes[i] = xd->mi[0]->bmi[i];
}
}
@@ -6015,6 +10857,9 @@
if (best_rd >= best_rd_so_far) {
rd_cost->rate = INT_MAX;
rd_cost->rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
return;
}
@@ -6022,18 +10867,31 @@
rd_cost->rate = INT_MAX;
rd_cost->dist = INT64_MAX;
rd_cost->rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+ *returnrate_nocoef = INT_MAX;
+#endif // CONFIG_SUPERTX
return;
}
+#if CONFIG_DUAL_FILTER
+ assert((cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter == best_mbmode.interp_filter[0]) ||
+ !is_inter_block(&best_mbmode));
+#else
assert((cm->interp_filter == SWITCHABLE) ||
(cm->interp_filter == best_mbmode.interp_filter) ||
!is_inter_block(&best_mbmode));
+#endif
- av1_update_rd_thresh_fact(tile_data->thresh_freq_fact, sf->adaptive_rd_thresh,
- bsize, best_ref_index);
+ av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+ sf->adaptive_rd_thresh, bsize, best_ref_index);
// macroblock modes
*mbmi = best_mbmode;
+#if CONFIG_VAR_TX && CONFIG_EXT_TX && CONFIG_RECT_TX
+ mbmi->inter_tx_size[0][0] = mbmi->tx_size;
+#endif  // CONFIG_VAR_TX && CONFIG_EXT_TX && CONFIG_RECT_TX
+
x->skip |= best_skip2;
if (!is_inter_block(&best_mbmode)) {
for (i = 0; i < 4; i++) xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
@@ -6092,7 +10950,7 @@
// mask(x, y) = Mh(x) * Mv(y)
//
// These can then be used to efficiently approximate the error for any
-// predictor P in the context of the provided neighboring predictors by
+// predictor P in the context of the provided neighbouring predictors by
// computing:
//
// error(x, y) =
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index a8cd000..678c0db 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -26,15 +26,85 @@
struct macroblock;
struct RD_COST;
+#if CONFIG_VAR_TX
+static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) {
+#if CONFIG_RD_DEBUG
+ int plane;
+#endif
+ rd_stats->rate = 0;
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ rd_stats->skip = 1;
+#if CONFIG_RD_DEBUG
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ int r, c;
+ rd_stats->txb_coeff_cost[plane] = 0;
+ for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r)
+ for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c)
+ rd_stats->txb_coeff_cost_map[plane][r][c] = 0;
+ }
+#endif
+}
+
+static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) {
+#if CONFIG_RD_DEBUG
+ int plane;
+#endif
+ rd_stats->rate = INT_MAX;
+ rd_stats->dist = INT64_MAX;
+ rd_stats->sse = INT64_MAX;
+ rd_stats->skip = 0;
+#if CONFIG_RD_DEBUG
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ int r, c;
+ rd_stats->txb_coeff_cost[plane] = INT_MAX;
+ for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r)
+ for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c)
+ rd_stats->txb_coeff_cost_map[plane][r][c] = INT_MAX;
+ }
+#endif
+}
+
+static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
+ const RD_STATS *rd_stats_src) {
+#if CONFIG_RD_DEBUG
+ int plane;
+#endif
+ rd_stats_dst->rate += rd_stats_src->rate;
+ rd_stats_dst->dist += rd_stats_src->dist;
+ rd_stats_dst->sse += rd_stats_src->sse;
+ rd_stats_dst->skip &= rd_stats_src->skip;
+#if CONFIG_RD_DEBUG
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ int r, c;
+ int ref_txb_coeff_cost = 0;
+ rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane];
+ // TODO(angiebird): optimize this part
+ for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r)
+ for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
+ rd_stats_dst->txb_coeff_cost_map[plane][r][c] +=
+ rd_stats_src->txb_coeff_cost_map[plane][r][c];
+ ref_txb_coeff_cost += rd_stats_dst->txb_coeff_cost_map[plane][r][c];
+ }
+ assert(ref_txb_coeff_cost == rd_stats_dst->txb_coeff_cost[plane]);
+ }
+#endif
+}
+#endif
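A minimal usage sketch for the helpers above (hypothetical caller, assuming
CONFIG_VAR_TX and an array of already-filled per-block RD_STATS): initialize a
running total, then merge each block's stats into it.

#if CONFIG_VAR_TX
static INLINE void example_accumulate_rd_stats(RD_STATS *total,
                                               const RD_STATS *per_block,
                                               int num_blocks) {
  int i;
  av1_init_rd_stats(total);  // rate/dist/sse = 0, skip = 1
  for (i = 0; i < num_blocks; ++i)
    av1_merge_rd_stats(total, &per_block[i]);  // sums rate/dist/sse, ANDs skip
}
#endif  // CONFIG_VAR_TX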
+
+int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x, int plane,
+ int block, int coeff_ctx, TX_SIZE tx_size,
+ const int16_t *scan, const int16_t *nb,
+ int use_fast_coef_costing);
void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
struct RD_COST *rd_cost, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx, int64_t best_rd);
-unsigned int av1_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
+unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi,
const struct buf_2d *ref,
BLOCK_SIZE bs);
#if CONFIG_AOM_HIGHBITDEPTH
-unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
+unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi,
const struct buf_2d *ref,
BLOCK_SIZE bs, int bd);
#endif
@@ -42,8 +112,12 @@
void av1_rd_pick_inter_mode_sb(const struct AV1_COMP *cpi,
struct TileDataEnc *tile_data,
struct macroblock *x, int mi_row, int mi_col,
- struct RD_COST *rd_cost, BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+ struct RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+ int *returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far);
void av1_rd_pick_inter_mode_sb_seg_skip(
const struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
@@ -58,10 +132,27 @@
void av1_rd_pick_inter_mode_sub8x8(const struct AV1_COMP *cpi,
struct TileDataEnc *tile_data,
struct macroblock *x, int mi_row, int mi_col,
- struct RD_COST *rd_cost, BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx,
+ struct RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+ int *returnrate_nocoef,
+#endif // CONFIG_SUPERTX
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far);
+#if CONFIG_SUPERTX
+#if CONFIG_VAR_TX
+void av1_tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
+ int blk_row, int blk_col, int plane, int block,
+ int plane_bsize, int coeff_ctx, RD_STATS *rd_stats);
+#endif
+
+void av1_txfm_rd_in_plane_supertx(MACROBLOCK *x, const AV1_COMP *cpi, int *rate,
+ int64_t *distortion, int *skippable,
+ int64_t *sse, int64_t ref_best_rd, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ int use_fast_coef_casting);
+#endif // CONFIG_SUPERTX
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/encoder/resize.c b/av1/encoder/resize.c
index c380976..ed416ed 100644
--- a/av1/encoder/resize.c
+++ b/av1/encoder/resize.c
@@ -365,7 +365,7 @@
}
static void resize_multistep(const uint8_t *const input, int length,
- uint8_t *output, int olength, uint8_t *buf) {
+ uint8_t *output, int olength, uint8_t *otmp) {
int steps;
if (length == olength) {
memcpy(output, input, sizeof(output[0]) * length);
@@ -376,15 +376,10 @@
if (steps > 0) {
int s;
uint8_t *out = NULL;
- uint8_t *tmpbuf = NULL;
- uint8_t *otmp, *otmp2;
+ uint8_t *otmp2;
int filteredlength = length;
- if (!tmpbuf) {
- tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) * length);
- otmp = tmpbuf;
- } else {
- otmp = buf;
- }
+
+ assert(otmp != NULL);
otmp2 = otmp + get_down2_length(length, 1);
for (s = 0; s < steps; ++s) {
const int proj_filteredlength = get_down2_length(filteredlength, 1);
@@ -402,7 +397,6 @@
if (filteredlength != olength) {
interpolate(out, filteredlength, output, olength);
}
- if (tmpbuf) free(tmpbuf);
} else {
interpolate(input, length, output, olength);
}
@@ -433,7 +427,10 @@
uint8_t *intbuf = (uint8_t *)malloc(sizeof(uint8_t) * width2 * height);
uint8_t *tmpbuf =
(uint8_t *)malloc(sizeof(uint8_t) * (width < height ? height : width));
- uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * (height + height2));
+ uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * height);
+ uint8_t *arrbuf2 = (uint8_t *)malloc(sizeof(uint8_t) * height2);
+ if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL)
+ goto Error;
assert(width > 0);
assert(height > 0);
assert(width2 > 0);
@@ -443,12 +440,15 @@
tmpbuf);
for (i = 0; i < width2; ++i) {
fill_col_to_arr(intbuf + i, width2, height, arrbuf);
- resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf);
- fill_arr_to_col(output + i, out_stride, height2, arrbuf + height);
+ resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf);
+ fill_arr_to_col(output + i, out_stride, height2, arrbuf2);
}
+
+Error:
free(intbuf);
free(tmpbuf);
free(arrbuf);
+ free(arrbuf2);
}
#if CONFIG_AOM_HIGHBITDEPTH
@@ -652,7 +652,7 @@
static void highbd_resize_multistep(const uint16_t *const input, int length,
uint16_t *output, int olength,
- uint16_t *buf, int bd) {
+ uint16_t *otmp, int bd) {
int steps;
if (length == olength) {
memcpy(output, input, sizeof(output[0]) * length);
@@ -663,15 +663,10 @@
if (steps > 0) {
int s;
uint16_t *out = NULL;
- uint16_t *tmpbuf = NULL;
- uint16_t *otmp, *otmp2;
+ uint16_t *otmp2;
int filteredlength = length;
- if (!tmpbuf) {
- tmpbuf = (uint16_t *)malloc(sizeof(uint16_t) * length);
- otmp = tmpbuf;
- } else {
- otmp = buf;
- }
+
+ assert(otmp != NULL);
otmp2 = otmp + get_down2_length(length, 1);
for (s = 0; s < steps; ++s) {
const int proj_filteredlength = get_down2_length(filteredlength, 1);
@@ -689,7 +684,6 @@
if (filteredlength != olength) {
highbd_interpolate(out, filteredlength, output, olength, bd);
}
- if (tmpbuf) free(tmpbuf);
} else {
highbd_interpolate(input, length, output, olength, bd);
}
@@ -722,21 +716,26 @@
uint16_t *intbuf = (uint16_t *)malloc(sizeof(uint16_t) * width2 * height);
uint16_t *tmpbuf =
(uint16_t *)malloc(sizeof(uint16_t) * (width < height ? height : width));
- uint16_t *arrbuf = (uint16_t *)malloc(sizeof(uint16_t) * (height + height2));
+ uint16_t *arrbuf = (uint16_t *)malloc(sizeof(uint16_t) * height);
+ uint16_t *arrbuf2 = (uint16_t *)malloc(sizeof(uint16_t) * height2);
+ if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL)
+ goto Error;
for (i = 0; i < height; ++i) {
highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width,
intbuf + width2 * i, width2, tmpbuf, bd);
}
for (i = 0; i < width2; ++i) {
highbd_fill_col_to_arr(intbuf + i, width2, height, arrbuf);
- highbd_resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf,
- bd);
+ highbd_resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf, bd);
highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + i), out_stride, height2,
- arrbuf + height);
+ arrbuf2);
}
+
+Error:
free(intbuf);
free(tmpbuf);
free(arrbuf);
+ free(arrbuf2);
}
#endif // CONFIG_AOM_HIGHBITDEPTH
diff --git a/av1/encoder/segmentation.c b/av1/encoder/segmentation.c
index f7c4ff2..828b31c 100644
--- a/av1/encoder/segmentation.c
+++ b/av1/encoder/segmentation.c
@@ -160,11 +160,98 @@
unsigned *t_unpred_seg_counts, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
const int mis = cm->mi_stride;
- int bw, bh;
const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2;
+#if CONFIG_EXT_PARTITION_TYPES
+ PARTITION_TYPE partition;
+#else
+ int bw, bh;
+#endif // CONFIG_EXT_PARTITION_TYPES
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+#if CONFIG_EXT_PARTITION_TYPES
+ if (bsize == BLOCK_8X8)
+ partition = PARTITION_NONE;
+ else
+ partition = get_partition(cm, mi_row, mi_col, bsize);
+ switch (partition) {
+ case PARTITION_NONE:
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, bs, mi_row, mi_col);
+ break;
+ case PARTITION_HORZ:
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
+ mi_row + hbs, mi_col);
+ break;
+ case PARTITION_VERT:
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row,
+ mi_col + hbs);
+ break;
+ case PARTITION_HORZ_A:
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, hbs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row, mi_col + hbs);
+ count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
+ mi_row + hbs, mi_col);
+ break;
+ case PARTITION_HORZ_B:
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row + hbs, mi_col);
+ count_segs(cm, xd, tile, mi + hbs + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_VERT_A:
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, hbs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row + hbs, mi_col);
+ count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, bs, mi_row,
+ mi_col + hbs);
+ break;
+ case PARTITION_VERT_B:
+ count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+ t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+ count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row, mi_col + hbs);
+ count_segs(cm, xd, tile, mi + hbs + hbs * mis, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+ mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_SPLIT: {
+ const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
+ int n;
+
+ assert(num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type] < bs &&
+ num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type] < bs);
+
+ for (n = 0; n < 4; n++) {
+ const int mi_dc = hbs * (n & 1);
+ const int mi_dr = hbs * (n >> 1);
+
+ count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts,
+ mi_row + mi_dr, mi_col + mi_dc, subsize);
+ }
+ } break;
+ default: assert(0);
+ }
+#else
bw = num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type];
bh = num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type];
@@ -198,45 +285,56 @@
mi_row + mi_dr, mi_col + mi_dc, subsize);
}
}
+#endif // CONFIG_EXT_PARTITION_TYPES
}
void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) {
struct segmentation *seg = &cm->seg;
struct segmentation_probs *segp = &cm->fc->seg;
+
int no_pred_cost;
int t_pred_cost = INT_MAX;
- int i, tile_col, mi_row, mi_col;
+ int i, tile_col, tile_row, mi_row, mi_col;
#if CONFIG_TILE_GROUPS
const int probwt = cm->num_tg;
#else
const int probwt = 1;
#endif
+
unsigned(*temporal_predictor_count)[2] = cm->counts.seg.pred;
unsigned *no_pred_segcounts = cm->counts.seg.tree_total;
unsigned *t_unpred_seg_counts = cm->counts.seg.tree_mispred;
+
aom_prob no_pred_tree[SEG_TREE_PROBS];
aom_prob t_pred_tree[SEG_TREE_PROBS];
aom_prob t_nopred_prob[PREDICTION_PROBS];
+
(void)xd;
+
+ // We are about to recompute all the segment counts, so zero the accumulators.
av1_zero(cm->counts.seg);
// First of all generate stats regarding how well the last segment map
// predicts this one
- for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) {
- TileInfo tile;
- MODE_INFO **mi_ptr;
- av1_tile_init(&tile, cm, 0, tile_col);
-
- mi_ptr = cm->mi_grid_visible + tile.mi_col_start;
- for (mi_row = 0; mi_row < cm->mi_rows;
- mi_row += 8, mi_ptr += 8 * cm->mi_stride) {
- MODE_INFO **mi = mi_ptr;
- for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
- mi_col += 8, mi += 8)
- count_segs_sb(cm, xd, &tile, mi, no_pred_segcounts,
- temporal_predictor_count, t_unpred_seg_counts, mi_row,
- mi_col, BLOCK_64X64);
+ for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+ TileInfo tile_info;
+ av1_tile_set_row(&tile_info, cm, tile_row);
+ for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
+ MODE_INFO **mi_ptr;
+ av1_tile_set_col(&tile_info, cm, tile_col);
+ mi_ptr = cm->mi_grid_visible + tile_info.mi_row_start * cm->mi_stride +
+ tile_info.mi_col_start;
+ for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
+ mi_row += cm->mib_size, mi_ptr += cm->mib_size * cm->mi_stride) {
+ MODE_INFO **mi = mi_ptr;
+ for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+ mi_col += cm->mib_size, mi += cm->mib_size) {
+ count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, mi_row,
+ mi_col, cm->sb_size);
+ }
+ }
}
}
diff --git a/av1/encoder/speed_features.c b/av1/encoder/speed_features.c
index 75c8508..2fb651c 100644
--- a/av1/encoder/speed_features.c
+++ b/av1/encoder/speed_features.c
@@ -138,9 +138,12 @@
SPEED_FEATURES *sf, int speed) {
const int boosted = frame_is_boosted(cpi);
- sf->adaptive_rd_thresh = 1;
-
if (speed >= 1) {
+ sf->tx_type_search.fast_intra_tx_type_search = 1;
+ sf->tx_type_search.fast_inter_tx_type_search = 1;
+ }
+
+ if (speed >= 2) {
if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
av1_internal_image_edge(cpi)) {
sf->use_square_partition_only = !frame_is_boosted(cpi);
@@ -153,7 +156,7 @@
sf->use_rd_breakout = 1;
sf->adaptive_motion_search = 1;
sf->mv.auto_mv_step_size = 1;
- sf->adaptive_rd_thresh = 2;
+ sf->adaptive_rd_thresh = 1;
sf->mv.subpel_iters_per_step = 1;
sf->mode_skip_start = 10;
sf->adaptive_pred_interp_filter = 1;
@@ -166,12 +169,19 @@
sf->tx_size_search_breakout = 1;
sf->partition_search_breakout_rate_thr = 80;
+ sf->tx_type_search.prune_mode = PRUNE_ONE;
+ // Use transform domain distortion.
+ // Note var-tx expt always uses pixel domain distortion.
+ sf->use_transform_domain_distortion = 1;
+#if CONFIG_EXT_INTER
+ sf->disable_wedge_search_var_thresh = 100;
+ sf->fast_wedge_sign_estimate = 1;
+#endif // CONFIG_EXT_INTER
}
- if (speed >= 2) {
+ if (speed >= 3) {
sf->tx_size_search_method =
frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL;
-
sf->mode_search_skip_flags =
(cm->frame_type == KEY_FRAME) ? 0 : FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
@@ -182,9 +192,13 @@
sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
sf->allow_partition_search_skip = 1;
sf->use_upsampled_references = 0;
+ sf->adaptive_rd_thresh = 2;
+#if CONFIG_EXT_TX
+ sf->tx_type_search.prune_mode = PRUNE_TWO;
+#endif
}
- if (speed >= 3) {
+ if (speed >= 4) {
sf->use_square_partition_only = !frame_is_intra_only(cm);
sf->tx_size_search_method =
frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
@@ -202,7 +216,7 @@
sf->adaptive_interp_filter_search = 1;
}
- if (speed >= 4) {
+ if (speed >= 5) {
sf->use_square_partition_only = 1;
sf->tx_size_search_method = USE_LARGESTALL;
sf->mv.search_method = BIGDIA;
@@ -217,7 +231,7 @@
sf->partition_search_breakout_rate_thr = 300;
}
- if (speed >= 5) {
+ if (speed >= 6) {
int i;
sf->optimize_coefficients = 0;
sf->mv.search_method = HEX;
@@ -236,7 +250,6 @@
SPEED_FEATURES *sf,
int speed) {
AV1_COMMON *const cm = &cpi->common;
-
if (speed >= 1) {
if (AOMMIN(cm->width, cm->height) >= 720) {
sf->disable_split_mask =
@@ -275,6 +288,14 @@
sf->allow_exhaustive_searches = 0;
sf->exhaustive_searches_thresh = INT_MAX;
sf->use_upsampled_references = 0;
+#if CONFIG_EXT_INTER
+ sf->disable_wedge_search_var_thresh = 100;
+ sf->fast_wedge_sign_estimate = 1;
+#endif // CONFIG_EXT_INTER
+
+ // Use transform domain distortion computation
+ // Note var-tx expt always uses pixel domain distortion.
+ sf->use_transform_domain_distortion = 1;
if (speed >= 1) {
sf->use_square_partition_only = !frame_is_intra_only(cm);
@@ -344,11 +365,15 @@
sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST;
sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST;
sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST;
+#if CONFIG_EXT_PARTITION
+ sf->inter_mode_mask[BLOCK_64X128] = INTER_NEAREST;
+ sf->inter_mode_mask[BLOCK_128X64] = INTER_NEAREST;
+ sf->inter_mode_mask[BLOCK_128X128] = INTER_NEAREST;
+#endif // CONFIG_EXT_PARTITION
sf->max_intra_bsize = BLOCK_32X32;
}
if (speed >= 5) {
- sf->use_quant_fp = !is_keyframe;
sf->auto_min_max_partition_size =
is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
sf->default_max_partition_size = BLOCK_32X32;
@@ -362,6 +387,11 @@
sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO;
sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO;
sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO;
+#if CONFIG_EXT_PARTITION
+ sf->inter_mode_mask[BLOCK_64X128] = INTER_NEAREST_NEW_ZERO;
+ sf->inter_mode_mask[BLOCK_128X64] = INTER_NEAREST_NEW_ZERO;
+ sf->inter_mode_mask[BLOCK_128X128] = INTER_NEAREST_NEW_ZERO;
+#endif // CONFIG_EXT_PARTITION
sf->adaptive_rd_thresh = 2;
// This feature is only enabled when partition search is disabled.
sf->reuse_inter_pred_sby = 1;
@@ -461,13 +491,15 @@
sf->cb_pred_filter_search = 0;
sf->cb_partition_search = 0;
sf->alt_ref_search_fp = 0;
- sf->use_quant_fp = 0;
sf->partition_search_type = SEARCH_PARTITION;
+ sf->tx_type_search.prune_mode = NO_PRUNE;
+ sf->tx_type_search.fast_intra_tx_type_search = 0;
+ sf->tx_type_search.fast_inter_tx_type_search = 0;
sf->less_rectangular_check = 0;
sf->use_square_partition_only = 0;
sf->auto_min_max_partition_size = NOT_IN_USE;
sf->rd_auto_partition_min_limit = BLOCK_4X4;
- sf->default_max_partition_size = BLOCK_64X64;
+ sf->default_max_partition_size = BLOCK_LARGEST;
sf->default_min_partition_size = BLOCK_4X4;
sf->adjust_partitioning_from_last_frame = 0;
sf->last_partitioning_redo_frequency = 4;
@@ -478,7 +510,15 @@
sf->disable_filter_search_var_thresh = 0;
sf->adaptive_interp_filter_search = 0;
sf->allow_partition_search_skip = 0;
+#if CONFIG_EXT_TILE
+ sf->use_upsampled_references = 0;
+#else
sf->use_upsampled_references = 1;
+#endif // CONFIG_EXT_TILE
+#if CONFIG_EXT_INTER
+ sf->disable_wedge_search_var_thresh = 0;
+ sf->fast_wedge_sign_estimate = 0;
+#endif // CONFIG_EXT_INTER
for (i = 0; i < TX_SIZES; i++) {
sf->intra_y_mode_mask[i] = INTRA_ALL;
@@ -491,7 +531,7 @@
sf->mode_skip_start = MAX_MODES; // Mode index at which mode skip mask set
sf->schedule_mode_search = 0;
for (i = 0; i < BLOCK_SIZES; ++i) sf->inter_mode_mask[i] = INTER_ALL;
- sf->max_intra_bsize = BLOCK_64X64;
+ sf->max_intra_bsize = BLOCK_LARGEST;
sf->reuse_inter_pred_sby = 0;
// This setting only takes effect when partition_search_type is set
// to FIXED_PARTITION.
@@ -505,11 +545,24 @@
sf->partition_search_breakout_rate_thr = 0;
sf->simple_model_rd_from_var = 0;
+// Set this at the appropriate speed levels
+#if CONFIG_EXT_TILE
+ sf->use_transform_domain_distortion = 1;
+#else
+ sf->use_transform_domain_distortion = 0;
+#endif // CONFIG_EXT_TILE
+
if (oxcf->mode == REALTIME)
set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content);
else if (oxcf->mode == GOOD)
set_good_speed_feature(cpi, cm, sf, oxcf->speed);
+ // sf->partition_search_breakout_dist_thr is set assuming max 64x64
+ // blocks. Normalise this if the blocks are bigger.
+ if (MAX_SB_SIZE_LOG2 > 6) {
+ sf->partition_search_breakout_dist_thr <<= 2 * (MAX_SB_SIZE_LOG2 - 6);
+ }
+
cpi->full_search_sad = av1_full_search_sad;
cpi->diamond_search_sad = av1_diamond_search_sad;
diff --git a/av1/encoder/speed_features.h b/av1/encoder/speed_features.h
index ae6ac9c..c6821bf 100644
--- a/av1/encoder/speed_features.h
+++ b/av1/encoder/speed_features.h
@@ -29,6 +29,43 @@
(1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) | (1 << H_PRED)
};
+#if CONFIG_EXT_INTER
+enum {
+ INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV) |
+ (1 << NEWFROMNEARMV) | (1 << NEAREST_NEARESTMV) |
+ (1 << NEAR_NEARMV) | (1 << NEAREST_NEARMV) |
+ (1 << NEAR_NEARESTMV) | (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) |
+ (1 << NEAR_NEWMV) | (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) |
+ (1 << ZERO_ZEROMV),
+ INTER_NEAREST = (1 << NEARESTMV) | (1 << NEAREST_NEARESTMV) |
+ (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) |
+ (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV),
+ INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV) | (1 << NEWFROMNEARMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) |
+ (1 << NEAR_NEARESTMV) | (1 << NEAREST_NEARMV) |
+ (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) |
+ (1 << NEW_NEARMV) | (1 << NEAR_NEWMV),
+ INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) |
+ (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) |
+ (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV),
+ INTER_NEAREST_NEW_ZERO =
+ (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV) | (1 << NEWFROMNEARMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | (1 << NEW_NEWMV) |
+ (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) | (1 << NEW_NEARESTMV) |
+ (1 << NEAREST_NEWMV) | (1 << NEW_NEARMV) | (1 << NEAR_NEWMV),
+ INTER_NEAREST_NEAR_NEW =
+ (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV) | (1 << NEWFROMNEARMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) | (1 << NEAREST_NEARMV) |
+ (1 << NEAR_NEARESTMV) | (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) |
+ (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV),
+ INTER_NEAREST_NEAR_ZERO =
+ (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | (1 << NEAREST_NEARMV) |
+ (1 << NEAR_NEARESTMV) | (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) |
+ (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV),
+};
+#else
enum {
INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
INTER_NEAREST = (1 << NEARESTMV),
@@ -38,6 +75,7 @@
INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV),
};
+#endif // CONFIG_EXT_INTER
enum {
DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
@@ -128,10 +166,26 @@
} MODE_SEARCH_SKIP_LOGIC;
typedef enum {
- FLAG_SKIP_EIGHTTAP = 1 << EIGHTTAP,
+ FLAG_SKIP_EIGHTTAP_REGULAR = 1 << EIGHTTAP_REGULAR,
FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH,
- FLAG_SKIP_EIGHTTAP_SHARP = 1 << EIGHTTAP_SHARP,
-} InterpFilter_MASK;
+ FLAG_SKIP_MULTITAP_SHARP = 1 << MULTITAP_SHARP,
+} INTERP_FILTER_MASK;
+
+typedef enum {
+ NO_PRUNE = 0,
+ // eliminates one tx type in vertical and horizontal direction
+ PRUNE_ONE = 1,
+#if CONFIG_EXT_TX
+ // eliminates two tx types in each direction
+ PRUNE_TWO = 2,
+#endif
+} TX_TYPE_PRUNE_MODE;
+
+typedef struct {
+ TX_TYPE_PRUNE_MODE prune_mode;
+ int fast_intra_tx_type_search;
+ int fast_inter_tx_type_search;
+} TX_TYPE_SEARCH;
typedef enum {
// Search partitions using RD criterion
@@ -246,6 +300,8 @@
PARTITION_SEARCH_TYPE partition_search_type;
+ TX_TYPE_SEARCH tx_type_search;
+
// Used if partition_search_type = FIXED_SIZE_PARTITION
BLOCK_SIZE always_this_block_size;
@@ -256,8 +312,8 @@
// Disable testing non square partitions. (eg 16x32)
int use_square_partition_only;
- // Sets min and max partition sizes for this 64x64 region based on the
- // same 64x64 in last encoded frame, and the left and above neighbor.
+ // Sets min and max partition sizes for this superblock based on the
+ // same superblock in last encoded frame, and the left and above neighbor.
AUTO_MIN_MAX_MODE auto_min_max_partition_size;
// Ensures the rd based auto partition search will always
// go down at least to the specified level.
@@ -316,9 +372,6 @@
int alt_ref_search_fp;
- // Fast quantization process path
- int use_quant_fp;
-
// Use finer quantizer in every other few frames that run variable block
// partition type search.
int force_frame_boost;
@@ -335,6 +388,14 @@
// Choose a very large value (UINT_MAX) to use 8-tap always
unsigned int disable_filter_search_var_thresh;
+#if CONFIG_EXT_INTER
+ // A source variance threshold below which wedge search is disabled
+ unsigned int disable_wedge_search_var_thresh;
+
+ // Whether fast wedge sign estimate is used
+ int fast_wedge_sign_estimate;
+#endif // CONFIG_EXT_INTER
+
// These bit masks allow you to enable or disable intra modes for each
// transform size separately.
int intra_y_mode_mask[TX_SIZES];
@@ -393,7 +454,7 @@
int adaptive_interp_filter_search;
// mask for skip evaluation of certain interp_filter type.
- InterpFilter_MASK interp_filter_search_mask;
+ INTERP_FILTER_MASK interp_filter_search_mask;
// Partition search early breakout thresholds.
int64_t partition_search_breakout_dist_thr;
@@ -407,6 +468,10 @@
// Do sub-pixel search in up-sampled reference frames
int use_upsampled_references;
+
+ // Whether to compute distortion in the image domain (slower but
+ // more accurate), or in the transform domain (faster but less accurate).
+ int use_transform_domain_distortion;
} SPEED_FEATURES;
struct AV1_COMP;
diff --git a/av1/encoder/subexp.c b/av1/encoder/subexp.c
index 1944f14..81bb56d 100644
--- a/av1/encoder/subexp.c
+++ b/av1/encoder/subexp.c
@@ -176,6 +176,83 @@
return bestsavings;
}
+#if CONFIG_ENTROPY
+static int get_cost(unsigned int ct[][2], aom_prob p, int n) {
+ int i, p0 = p;
+ unsigned int total_ct[2] = { 0, 0 };
+ int cost = 0;
+
+ for (i = 0; i <= n; ++i) {
+ cost += cost_branch256(ct[i], p);
+ total_ct[0] += ct[i][0];
+ total_ct[1] += ct[i][1];
+ if (i < n)
+ p = av1_merge_probs(p0, total_ct, COEF_COUNT_SAT_BITS,
+ COEF_MAX_UPDATE_FACTOR_BITS);
+ }
+ return cost;
+}
+
+int av1_prob_update_search_subframe(unsigned int ct[][2], aom_prob oldp,
+ aom_prob *bestp, aom_prob upd, int n) {
+ const int old_b = get_cost(ct, oldp, n);
+ int bestsavings = 0;
+ aom_prob newp, bestnewp = oldp;
+ const int step = *bestp > oldp ? -1 : 1;
+
+ for (newp = *bestp; newp != oldp; newp += step) {
+ const int new_b = get_cost(ct, newp, n);
+ const int update_b = prob_diff_update_cost(newp, oldp) + av1_cost_upd256;
+ const int savings = old_b - new_b - update_b;
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = newp;
+ }
+ }
+ *bestp = bestnewp;
+ return bestsavings;
+}
+
+int av1_prob_update_search_model_subframe(unsigned int ct[ENTROPY_NODES]
+ [COEF_PROBS_BUFS][2],
+ const aom_prob *oldp, aom_prob *bestp,
+ aom_prob upd, int stepsize, int n) {
+ int i, old_b, new_b, update_b, savings, bestsavings;
+ int newp;
+ const int step_sign = *bestp > oldp[PIVOT_NODE] ? -1 : 1;
+ const int step = stepsize * step_sign;
+ aom_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
+ av1_model_to_full_probs(oldp, oldplist);
+ memcpy(newplist, oldp, sizeof(aom_prob) * UNCONSTRAINED_NODES);
+ for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
+ old_b += get_cost(ct[i], oldplist[i], n);
+ old_b += get_cost(ct[PIVOT_NODE], oldplist[PIVOT_NODE], n);
+
+ bestsavings = 0;
+ bestnewp = oldp[PIVOT_NODE];
+
+ assert(stepsize > 0);
+
+ for (newp = *bestp; (newp - oldp[PIVOT_NODE]) * step_sign < 0; newp += step) {
+ if (newp < 1 || newp > 255) continue;
+ newplist[PIVOT_NODE] = newp;
+ av1_model_to_full_probs(newplist, newplist);
+ for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
+ new_b += get_cost(ct[i], newplist[i], n);
+ new_b += get_cost(ct[PIVOT_NODE], newplist[PIVOT_NODE], n);
+ update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) + av1_cost_upd256;
+ savings = old_b - new_b - update_b;
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = newp;
+ }
+ }
+
+ *bestp = bestnewp;
+ return bestsavings;
+}
+#endif // CONFIG_ENTROPY
+
void av1_cond_prob_diff_update(aom_writer *w, aom_prob *oldp,
const unsigned int ct[2], int probwt) {
const aom_prob upd = DIFF_UPDATE_PROB;
@@ -200,3 +277,16 @@
av1_prob_diff_update_savings_search(ct, *oldp, &newp, upd, probwt);
return savings;
}
+
+void aom_write_primitive_symmetric(aom_writer *w, int word,
+ unsigned int abs_bits) {
+ if (word == 0) {
+ aom_write_bit(w, 0);
+ } else {
+ const int x = abs(word);
+ const int s = word < 0;
+ aom_write_bit(w, 1);
+ aom_write_bit(w, s);
+ aom_write_literal(w, x - 1, abs_bits);
+ }
+}
diff --git a/av1/encoder/subexp.h b/av1/encoder/subexp.h
index 830afd2..d01dea9 100644
--- a/av1/encoder/subexp.h
+++ b/av1/encoder/subexp.h
@@ -35,7 +35,22 @@
int av1_cond_prob_diff_update_savings(aom_prob *oldp, const unsigned int ct[2],
int probwt);
+#if CONFIG_ENTROPY
+int av1_prob_update_search_subframe(unsigned int ct[][2], aom_prob oldp,
+ aom_prob *bestp, aom_prob upd, int n);
+int av1_prob_update_search_model_subframe(unsigned int ct[ENTROPY_NODES]
+ [COEF_PROBS_BUFS][2],
+ const aom_prob *oldp, aom_prob *bestp,
+ aom_prob upd, int stepsize, int n);
+#endif // CONFIG_ENTROPY
+//
+// mag_bits is the number of bits for the magnitude. The alphabet is of
+// size 2 * 2^mag_bits + 1, symmetric around 0: one bit indicates zero vs.
+// non-zero, mag_bits bits encode the magnitude, and one more bit encodes
+// the sign if non-zero.
+void aom_write_primitive_symmetric(aom_writer *w, int word,
+ unsigned int mag_bits);
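Given the comment above and the writer added in subexp.c, the encoded length
follows directly: a zero codes in one bit, and any non-zero value codes in
2 + mag_bits bits (flag, sign, magnitude). A minimal sketch of that arithmetic
(hypothetical helper, illustration only):

// Bits emitted by aom_write_primitive_symmetric() for 'word'.
static INLINE int count_primitive_symmetric_bits(int word,
                                                 unsigned int mag_bits) {
  return (word == 0) ? 1 : (int)(2 + mag_bits);
}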
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index aaa6bd5..2285e46 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -39,9 +39,21 @@
const MV mv = { mv_row, mv_col };
enum mv_precision mv_precision_uv;
int uv_stride;
- InterpFilter interp_filter[4] = { EIGHTTAP_SHARP, EIGHTTAP_SHARP,
- EIGHTTAP_SHARP, EIGHTTAP_SHARP };
+
+#if USE_TEMPORALFILTER_12TAP
+#if CONFIG_DUAL_FILTER
+ const InterpFilter interp_filter[4] = { TEMPORALFILTER_12TAP,
+ TEMPORALFILTER_12TAP,
+ TEMPORALFILTER_12TAP,
+ TEMPORALFILTER_12TAP };
+#else
+ const InterpFilter interp_filter = TEMPORALFILTER_12TAP;
+#endif
(void)xd;
+#else
+ const InterpFilter interp_filter = xd->mi[0]->mbmi.interp_filter;
+#endif // USE_TEMPORALFILTER_12TAP
+
if (uv_block_width == 8) {
uv_stride = (stride + 1) >> 1;
mv_precision_uv = MV_PRECISION_Q4;
@@ -221,7 +233,6 @@
MV best_ref_mv1 = { 0, 0 };
MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
- MV *ref_mv = &x->e_mbd.mi[0]->bmi[0].as_mv[0].as_mv;
// Save input state
struct buf_2d src = x->plane[0].src;
@@ -249,14 +260,16 @@
// Ignore mv costing by sending NULL pointer instead of cost arrays
av1_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
cond_cost_list(cpi, cost_list), &cpi->fn_ptr[BLOCK_16X16], 0,
- &best_ref_mv1, ref_mv);
+ &best_ref_mv1);
// Ignore mv costing by sending NULL pointer instead of cost array
bestsme = cpi->find_fractional_mv_step(
- x, ref_mv, &best_ref_mv1, cpi->common.allow_high_precision_mv,
- x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], 0,
- mv_sf->subpel_iters_per_step, cond_cost_list(cpi, cost_list), NULL, NULL,
- &distortion, &sse, NULL, 0, 0, 0);
+ x, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
+ 0);
+
+ x->e_mbd.mi[0]->bmi[0].as_mv[0] = x->best_mv;
// Restore input state
x->plane[0].src = src;
@@ -308,7 +321,7 @@
for (mb_row = 0; mb_row < mb_rows; mb_row++) {
// Source frames are extended to 16 pixels. This is different than
- // L/A/G reference frames that have a border of 32 (AOMENCBORDERINPIXELS)
+ // L/A/G reference frames that have a border of 32 (AV1ENCBORDERINPIXELS)
// A 6/8 tap filter is used for motion search. This requires 2 pixels
// before and 3 pixels after. So the largest Y mv on a border would
// then be 16 - AOM_INTERP_EXTEND. The UV blocks are half the size of the
@@ -370,14 +383,14 @@
if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
int adj_strength = strength + 2 * (mbd->bd - 8);
// Apply the filter (YUV)
- av1_highbd_temporal_filter_apply_c(
+ av1_highbd_temporal_filter_apply(
f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16,
adj_strength, filter_weight, accumulator, count);
- av1_highbd_temporal_filter_apply_c(
+ av1_highbd_temporal_filter_apply(
f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
mb_uv_width, mb_uv_height, adj_strength, filter_weight,
accumulator + 256, count + 256);
- av1_highbd_temporal_filter_apply_c(
+ av1_highbd_temporal_filter_apply(
f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
mb_uv_width, mb_uv_height, adj_strength, filter_weight,
accumulator + 512, count + 512);
@@ -613,9 +626,31 @@
int frames_to_blur_forward;
struct scale_factors sf;
YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
+#if CONFIG_EXT_REFS
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+#endif
// Apply context specific adjustments to the arnr filter parameters.
adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength);
+// TODO(weitinglin): Currently, we enforce the filtering strength of
+// extra ARFs to be zero. We should investigate in which
+// cases it is more beneficial to use non-zero strength
+// filtering.
+#if CONFIG_EXT_REFS
+ if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) {
+ strength = 0;
+ frames_to_blur = 1;
+ }
+#endif
+
+#if CONFIG_EXT_REFS
+ if (strength == 0 && frames_to_blur == 1) {
+ cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 1;
+ } else {
+ cpi->is_arf_filter_off[gf_group->arf_update_idx[gf_group->index]] = 0;
+ }
+#endif
+
frames_to_blur_backward = (frames_to_blur / 2);
frames_to_blur_forward = ((frames_to_blur - 1) / 2);
start_frame = distance + frames_to_blur_forward;
diff --git a/av1/encoder/tokenize.c b/av1/encoder/tokenize.c
index 7efeb55..5aafa79 100644
--- a/av1/encoder/tokenize.c
+++ b/av1/encoder/tokenize.c
@@ -23,6 +23,7 @@
#include "av1/encoder/cost.h"
#include "av1/encoder/encoder.h"
+#include "av1/encoder/rdopt.h"
#include "av1/encoder/tokenize.h"
static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {
@@ -50,32 +51,44 @@
dct_cat_lt_10_value_tokens +
(sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens)) /
2;
+// The corresponding costs of the extrabits for the tokens in the above table
+// are stored in the table below. The values are obtained from looking up the
+// entry for the specified extrabits in the table corresponding to the token
+// (as defined in cost element av1_extra_bits)
+// e.g. {9, 63} maps to cat5_cost[63 >> 1], {1, 1} maps to sign_cost[1 >> 1]
+static const int dct_cat_lt_10_value_cost[] = {
+ 3773, 3750, 3704, 3681, 3623, 3600, 3554, 3531, 3432, 3409, 3363, 3340, 3282,
+ 3259, 3213, 3190, 3136, 3113, 3067, 3044, 2986, 2963, 2917, 2894, 2795, 2772,
+ 2726, 2703, 2645, 2622, 2576, 2553, 3197, 3116, 3058, 2977, 2881, 2800, 2742,
+ 2661, 2615, 2534, 2476, 2395, 2299, 2218, 2160, 2079, 2566, 2427, 2334, 2195,
+ 2023, 1884, 1791, 1652, 1893, 1696, 1453, 1256, 1229, 864, 512, 512, 512,
+ 512, 0, 512, 512, 512, 512, 864, 1229, 1256, 1453, 1696, 1893, 1652,
+ 1791, 1884, 2023, 2195, 2334, 2427, 2566, 2079, 2160, 2218, 2299, 2395, 2476,
+ 2534, 2615, 2661, 2742, 2800, 2881, 2977, 3058, 3116, 3197, 2553, 2576, 2622,
+ 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963, 2986, 3044, 3067, 3113, 3136,
+ 3190, 3213, 3259, 3282, 3340, 3363, 3409, 3432, 3531, 3554, 3600, 3623, 3681,
+ 3704, 3750, 3773,
+};
+const int *av1_dct_cat_lt_10_value_cost =
+ dct_cat_lt_10_value_cost +
+ (sizeof(dct_cat_lt_10_value_cost) / sizeof(*dct_cat_lt_10_value_cost)) / 2;
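The cost table is symmetric and the exported pointer is advanced to its
midpoint, so it can be indexed directly by a signed coefficient value. A
minimal sketch (hypothetical check, assuming <assert.h> is available): the
extra-bit cost of value 0 is 0, and +1/-1 both pay the 512 sign-bit cost.

static void example_extra_bit_cost_symmetry(void) {
  assert(av1_dct_cat_lt_10_value_cost[0] == 0);
  assert(av1_dct_cat_lt_10_value_cost[1] == 512);
  assert(av1_dct_cat_lt_10_value_cost[-1] == av1_dct_cat_lt_10_value_cost[1]);
}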
// Array indices are identical to previously-existing CONTEXT_NODE indices
+/* clang-format off */
const aom_tree_index av1_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
- -EOB_TOKEN,
- 2, // 0 = EOB
- -ZERO_TOKEN,
- 4, // 1 = ZERO
- -ONE_TOKEN,
- 6, // 2 = ONE
- 8,
- 12, // 3 = LOW_VAL
- -TWO_TOKEN,
- 10, // 4 = TWO
- -THREE_TOKEN,
- -FOUR_TOKEN, // 5 = THREE
- 14,
- 16, // 6 = HIGH_LOW
- -CATEGORY1_TOKEN,
- -CATEGORY2_TOKEN, // 7 = CAT_ONE
- 18,
- 20, // 8 = CAT_THREEFOUR
- -CATEGORY3_TOKEN,
- -CATEGORY4_TOKEN, // 9 = CAT_THREE
- -CATEGORY5_TOKEN,
- -CATEGORY6_TOKEN // 10 = CAT_FIVE
+ -EOB_TOKEN, 2, // 0 = EOB
+ -ZERO_TOKEN, 4, // 1 = ZERO
+ -ONE_TOKEN, 6, // 2 = ONE
+ 8, 12, // 3 = LOW_VAL
+ -TWO_TOKEN, 10, // 4 = TWO
+ -THREE_TOKEN, -FOUR_TOKEN, // 5 = THREE
+ 14, 16, // 6 = HIGH_LOW
+ -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 7 = CAT_ONE
+ 18, 20, // 8 = CAT_THREEFOUR
+ -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 9 = CAT_THREE
+ -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 10 = CAT_FIVE
};
+/* clang-format on */
static const int16_t zero_cost[] = { 0 };
static const int16_t sign_cost[1] = { 512 };
@@ -293,18 +306,44 @@
};
#endif
+#if !CONFIG_RANS
const struct av1_token av1_coef_encodings[ENTROPY_TOKENS] = {
{ 2, 2 }, { 6, 3 }, { 28, 5 }, { 58, 6 }, { 59, 6 }, { 60, 6 },
{ 61, 6 }, { 124, 7 }, { 125, 7 }, { 126, 7 }, { 127, 7 }, { 0, 1 }
};
+#endif // !CONFIG_RANS
struct tokenize_b_args {
const AV1_COMP *cpi;
ThreadData *td;
TOKENEXTRA **tp;
+ int this_rate;
};
#if !CONFIG_PVQ
+static void cost_coeffs_b(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+ struct tokenize_b_args *const args = arg;
+ const AV1_COMMON *cm = &args->cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ const PLANE_TYPE type = pd->plane_type;
+ const int ref = is_inter_block(mbmi);
+ const TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type, ref);
+ int pt = get_entropy_context(tx_size, pd->above_context + blk_col,
+ pd->left_context + blk_row);
+ int rate = av1_cost_coeffs(cm, x, plane, block, pt, tx_size, scan_order->scan,
+ scan_order->neighbors, 0);
+ args->this_rate += rate;
+ (void)plane_bsize;
+ av1_set_contexts(xd, pd, tx_size, p->eobs[block] > 0, blk_col, blk_row);
+}
+
static void set_entropy_context_b(int plane, int block, int blk_row,
int blk_col, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
@@ -337,13 +376,15 @@
static INLINE int get_tx_eob(const struct segmentation *seg, int segment_id,
TX_SIZE tx_size) {
- const int eob_max = 1 << (tx_size_1d_log2[tx_size] * 2);
+ const int eob_max = tx_size_2d[tx_size];
return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
}
#if CONFIG_PALETTE
-void av1_tokenize_palette_sb(const struct ThreadData *const td,
- BLOCK_SIZE bsize, int plane, TOKENEXTRA **t) {
+void av1_tokenize_palette_sb(const AV1_COMP *cpi,
+ const struct ThreadData *const td, int plane,
+ TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ int *rate) {
const MACROBLOCK *const x = &td->mb;
const MACROBLOCKD *const xd = &x->e_mbd;
const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
@@ -351,6 +392,7 @@
const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
const int n = pmi->palette_size[plane != 0];
int i, j;
+ int this_rate = 0;
uint8_t color_order[PALETTE_MAX_SIZE];
const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
(xd->plane[plane != 0].subsampling_y);
@@ -366,12 +408,15 @@
const int color_ctx = av1_get_palette_color_context(
color_map, cols, i, j, n, color_order, &color_new_idx);
assert(color_new_idx >= 0 && color_new_idx < n);
+ if (dry_run == DRY_RUN_COSTCOEFFS)
+ this_rate += cpi->palette_y_color_cost[n - 2][color_ctx][color_new_idx];
(*t)->token = color_new_idx;
(*t)->context_tree = probs[n - 2][color_ctx];
(*t)->skip_eob_node = 0;
++(*t);
}
}
+ if (rate) *rate += this_rate;
}
#endif // CONFIG_PALETTE
@@ -384,7 +429,7 @@
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
TOKENEXTRA **tp = args->tp;
- uint8_t token_cache[32 * 32];
+ uint8_t token_cache[MAX_TX_SQUARE];
struct macroblock_plane *p = &x->plane[plane];
struct macroblockd_plane *pd = &xd->plane[plane];
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
@@ -394,23 +439,35 @@
int eob = p->eobs[block];
const PLANE_TYPE type = pd->plane_type;
const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+#if CONFIG_SUPERTX
+ const int segment_id = AOMMIN(mbmi->segment_id, mbmi->segment_id_supertx);
+#else
const int segment_id = mbmi->segment_id;
+#endif  // CONFIG_SUPERTX
const int16_t *scan, *nb;
- const TX_TYPE tx_type = get_tx_type(type, xd, block);
- const SCAN_ORDER *const scan_order = get_scan(cm, tx_size, tx_type);
+ const TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size);
+ const SCAN_ORDER *const scan_order =
+ get_scan(cm, tx_size, tx_type, is_inter_block(mbmi));
const int ref = is_inter_block(mbmi);
unsigned int(*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
- td->rd_counts.coef_counts[tx_size][type][ref];
+ td->rd_counts.coef_counts[txsize_sqr_map[tx_size]][type][ref];
+#if CONFIG_ENTROPY
+ const aom_prob(*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+ cpi->subframe_stats.coef_probs_buf[cpi->common.coef_probs_update_idx]
+ [txsize_sqr_map[tx_size]][type][ref];
+#else
aom_prob(*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
- cpi->common.fc->coef_probs[tx_size][type][ref];
+ cpi->common.fc->coef_probs[txsize_sqr_map[tx_size]][type][ref];
+#endif // CONFIG_ENTROPY
#if CONFIG_EC_MULTISYMBOL
aom_cdf_prob(*const coef_cdfs)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
cpi->common.fc->coef_cdfs[tx_size][type][ref];
#endif
unsigned int(*const eob_branch)[COEFF_CONTEXTS] =
- td->counts->eob_branch[tx_size][type][ref];
+ td->counts->eob_branch[txsize_sqr_map[tx_size]][type][ref];
const uint8_t *const band = get_band_translate(tx_size);
const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+ int skip_eob = 0;
int16_t token;
EXTRABIT extra;
(void)plane_bsize;
@@ -421,24 +478,8 @@
c = 0;
while (c < eob) {
- int v = 0;
- int skip_eob = 0;
- v = qcoeff[scan[c]];
-
- while (!v) {
- add_token(&t, coef_probs[band[c]][pt],
-#if CONFIG_EC_MULTISYMBOL
- &coef_cdfs[band[c]][pt],
-#endif
- 0, ZERO_TOKEN, skip_eob, counts[band[c]][pt]);
- eob_branch[band[c]][pt] += !skip_eob;
-
- skip_eob = 1;
- token_cache[scan[c]] = 0;
- ++c;
- pt = get_coef_context(nb, token_cache, c);
- v = qcoeff[scan[c]];
- }
+ const int v = qcoeff[scan[c]];
+ eob_branch[band[c]][pt] += !skip_eob;
av1_get_token_extra(v, &token, &extra);
@@ -447,11 +488,11 @@
&coef_cdfs[band[c]][pt],
#endif
extra, (uint8_t)token, (uint8_t)skip_eob, counts[band[c]][pt]);
- eob_branch[band[c]][pt] += !skip_eob;
token_cache[scan[c]] = av1_pt_energy_class[token];
++c;
pt = get_coef_context(nb, token_cache, c);
+ skip_eob = (token == ZERO_TOKEN);
}
if (c < seg_eob) {
add_token(&t, coef_probs[band[c]][pt],
@@ -521,7 +562,6 @@
has_high_freq_coeff, &args);
return result;
}
-
#if CONFIG_PVQ
void add_pvq_block(AV1_COMMON *const cm, MACROBLOCK *const x, PVQ_INFO *pvq) {
PVQ_QUEUE *q = x->pvq_q;
@@ -558,9 +598,119 @@
add_pvq_block((AV1_COMMON * const)cm, x, pvq_info);
}
#endif
+#if CONFIG_VAR_TX
+void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
+ TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int blk_row,
+ int blk_col, int block, int plane, void *arg) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ const int tx_row = blk_row >> (1 - pd->subsampling_y);
+ const int tx_col = blk_col >> (1 - pd->subsampling_x);
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ TX_SIZE plane_tx_size;
+
+ assert(tx_size < TX_SIZES);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ plane_tx_size =
+ plane ? uv_txsize_lookup[bsize][mbmi->inter_tx_size[tx_row][tx_col]][0][0]
+ : mbmi->inter_tx_size[tx_row][tx_col];
+
+ if (tx_size == plane_tx_size) {
+ plane_bsize = get_plane_block_size(mbmi->sb_type, pd);
+ if (!dry_run)
+ tokenize_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
+ else if (dry_run == DRY_RUN_NORMAL)
+ set_entropy_context_b(plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, arg);
+ else if (dry_run == DRY_RUN_COSTCOEFFS)
+ cost_coeffs_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
+ } else {
+ // Half the block size in transform block unit.
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsl = tx_size_wide_unit[sub_txs];
+ int i;
+
+ assert(bsl > 0);
+
+ for (i = 0; i < 4; ++i) {
+ const int offsetr = blk_row + ((i >> 1) * bsl);
+ const int offsetc = blk_col + ((i & 0x01) * bsl);
+
+ int step = tx_size_wide_unit[sub_txs] * tx_size_high_unit[sub_txs];
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ tokenize_vartx(td, t, dry_run, sub_txs, plane_bsize, offsetr, offsetc,
+ block, plane, arg);
+ block += step;
+ }
+ }
+}
+
+void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+ RUN_TYPE dry_run, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ TOKENEXTRA *t_backup = *t;
+ const int ctx = av1_get_skip_context(xd);
+ const int skip_inc =
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+ struct tokenize_b_args arg = { cpi, td, t, 0 };
+ int plane;
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ if (mbmi->skip) {
+ if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
+ reset_skip_context(xd, bsize);
+ if (dry_run) *t = t_backup;
+ return;
+ }
+
+ if (!dry_run)
+ td->counts->skip[ctx][0] += skip_inc;
+ else
+ *t = t_backup;
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int mi_height = block_size_high[plane_bsize] >> tx_size_wide_log2[0];
+ const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+ const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+ int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
+ int bh = block_size_high[txb_size] >> tx_size_wide_log2[0];
+ int idx, idy;
+ int block = 0;
+ int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ for (idy = 0; idy < mi_height; idy += bh) {
+ for (idx = 0; idx < mi_width; idx += bw) {
+ tokenize_vartx(td, t, dry_run, max_tx_size, plane_bsize, idy, idx,
+ block, plane, &arg);
+ block += step;
+ }
+ }
+
+ if (!dry_run) {
+ (*t)->token = EOSB_TOKEN;
+ (*t)++;
+ }
+ }
+ if (rate) *rate += arg.this_rate;
+}
+#endif // CONFIG_VAR_TX
void av1_tokenize_sb(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
- int dry_run, BLOCK_SIZE bsize) {
+ RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate) {
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -568,7 +718,7 @@
const int ctx = av1_get_skip_context(xd);
const int skip_inc =
!segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
- struct tokenize_b_args arg = { cpi, td, t };
+ struct tokenize_b_args arg = { cpi, td, t, 0 };
if (mbmi->skip) {
if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
reset_skip_context(xd, bsize);
@@ -586,8 +736,10 @@
(*t)->token = EOSB_TOKEN;
(*t)++;
}
- } else {
+ } else if (dry_run == DRY_RUN_NORMAL) {
av1_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
+ } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+ av1_foreach_transformed_block(xd, bsize, cost_coeffs_b, &arg);
}
#else
if (!dry_run) {
@@ -600,4 +752,44 @@
&arg);
}
#endif
+ if (rate) *rate += arg.this_rate;
}
+
+#if CONFIG_SUPERTX
+void av1_tokenize_sb_supertx(const AV1_COMP *cpi, ThreadData *td,
+ TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ int *rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &td->mb.e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ TOKENEXTRA *t_backup = *t;
+ const int ctx = av1_get_skip_context(xd);
+ const int skip_inc =
+ !segfeature_active(&cm->seg, mbmi->segment_id_supertx, SEG_LVL_SKIP);
+ struct tokenize_b_args arg = { cpi, td, t, 0 };
+ if (mbmi->skip) {
+ if (!dry_run) td->counts->skip[ctx][1] += skip_inc;
+ reset_skip_context(xd, bsize);
+ if (dry_run) *t = t_backup;
+ return;
+ }
+
+ if (!dry_run) {
+ int plane;
+ td->counts->skip[ctx][0] += skip_inc;
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ av1_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b,
+ &arg);
+ (*t)->token = EOSB_TOKEN;
+ (*t)++;
+ }
+ } else if (dry_run == DRY_RUN_NORMAL) {
+ av1_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
+ *t = t_backup;
+ } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+ av1_foreach_transformed_block(xd, bsize, cost_coeffs_b, &arg);
+ }
+ if (rate) *rate += arg.this_rate;
+}
+#endif // CONFIG_SUPERTX
diff --git a/av1/encoder/tokenize.h b/av1/encoder/tokenize.h
index 0695567..27bdef5 100644
--- a/av1/encoder/tokenize.h
+++ b/av1/encoder/tokenize.h
@@ -46,7 +46,9 @@
extern const aom_tree_index av1_coef_tree[];
extern const aom_tree_index av1_coef_con_tree[];
+#if !CONFIG_RANS
extern const struct av1_token av1_coef_encodings[];
+#endif // !CONFIG_RANS
int av1_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
int av1_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
@@ -54,13 +56,34 @@
struct AV1_COMP;
struct ThreadData;
-#if CONFIG_PALETTE
-void av1_tokenize_palette_sb(const struct ThreadData *const td,
- BLOCK_SIZE bsize, int plane, TOKENEXTRA **t);
-#endif // CONFIG_PALETTE
+typedef enum {
+ OUTPUT_ENABLED = 0,
+ DRY_RUN_NORMAL,
+ DRY_RUN_COSTCOEFFS,
+} RUN_TYPE;
+// Note: in all the tokenize functions, rate (if non-NULL) is incremented
+// with the coefficient token cost only if dry_run == DRY_RUN_COSTCOEFFS;
+// otherwise rate is left unchanged.
+#if CONFIG_VAR_TX
+void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td,
+ TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int *rate);
+#endif
+#if CONFIG_PALETTE
+void av1_tokenize_palette_sb(const struct AV1_COMP *cpi,
+ const struct ThreadData *const td, int plane,
+ TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ int *rate);
+#endif // CONFIG_PALETTE
void av1_tokenize_sb(const struct AV1_COMP *cpi, struct ThreadData *td,
- TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
+ TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ int *rate);
+#if CONFIG_SUPERTX
+void av1_tokenize_sb_supertx(const struct AV1_COMP *cpi, struct ThreadData *td,
+ TOKENEXTRA **t, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ int *rate);
+#endif
extern const int16_t *av1_dct_value_cost_ptr;
/* TODO: The Token field should be broken out into a separate char array to
@@ -69,6 +92,7 @@
*/
extern const TOKENVALUE *av1_dct_value_tokens_ptr;
extern const TOKENVALUE *av1_dct_cat_lt_10_value_tokens;
+extern const int *av1_dct_cat_lt_10_value_cost;
extern const int16_t av1_cat6_low_cost[256];
extern const int av1_cat6_high_cost[64];
extern const int av1_cat6_high10_high_cost[256];
@@ -111,6 +135,19 @@
return av1_dct_cat_lt_10_value_tokens[v].token;
}
+static INLINE int av1_get_token_cost(int v, int16_t *token,
+ const int *cat6_high_table) {
+ if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {
+ EXTRABIT extrabits;
+ *token = CATEGORY6_TOKEN;
+ extrabits = abs(v) - CAT6_MIN_VAL;
+ return av1_cat6_low_cost[extrabits & 0xff] +
+ cat6_high_table[extrabits >> 8];
+ }
+ *token = av1_dct_cat_lt_10_value_tokens[v].token;
+ return av1_dct_cat_lt_10_value_cost[v];
+}
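A hedged usage sketch for av1_get_token_cost() (hypothetical helper): sum the
extra-bit cost over a list of quantized coefficient values, assuming 8-bit
data so that av1_cat6_high_cost is the appropriate high-cost table. The
token-tree cost itself still comes from the coefficient probability model and
is not included here.

static INLINE int example_extra_bits_cost(const int *values, int n) {
  int i, cost = 0;
  for (i = 0; i < n; ++i) {
    int16_t token;
    cost += av1_get_token_cost(values[i], &token, av1_cat6_high_cost);
  }
  return cost;
}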
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/encoder/variance_tree.c b/av1/encoder/variance_tree.c
new file mode 100644
index 0000000..3a23027
--- /dev/null
+++ b/av1/encoder/variance_tree.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "av1/encoder/variance_tree.h"
+#include "av1/encoder/encoder.h"
+
+void av1_setup_var_tree(struct AV1Common *cm, ThreadData *td) {
+ int i, j;
+#if CONFIG_EXT_PARTITION
+ const int leaf_nodes = 1024;
+ const int tree_nodes = 1024 + 256 + 64 + 16 + 4 + 1;
+#else
+ const int leaf_nodes = 256;
+ const int tree_nodes = 256 + 64 + 16 + 4 + 1;
+#endif // CONFIG_EXT_PARTITION
+ int index = 0;
+ VAR_TREE *this_var;
+ int nodes;
+
+ aom_free(td->var_tree);
+ CHECK_MEM_ERROR(cm, td->var_tree,
+ aom_calloc(tree_nodes, sizeof(*td->var_tree)));
+
+ this_var = &td->var_tree[0];
+
+ // Sets up all the leaf nodes in the tree.
+ for (index = 0; index < leaf_nodes; ++index) {
+ VAR_TREE *const leaf = &td->var_tree[index];
+ leaf->split[0] = NULL;
+ }
+
+ // Each node has 4 child nodes; fill in the child pointers
+ // from the leaves up to the root.
+ for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+ for (i = 0; i < nodes; ++i, ++index) {
+ VAR_TREE *const node = &td->var_tree[index];
+ for (j = 0; j < 4; j++) node->split[j] = this_var++;
+ }
+ }
+
+ // Set up the root node for the largest superblock size
+ i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
+ td->var_root[i] = &td->var_tree[tree_nodes - 1];
+ // Set up the root nodes for the rest of the possible superblock sizes
+ while (--i >= 0) {
+ td->var_root[i] = td->var_root[i + 1]->split[0];
+ }
+}
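A small traversal sketch for the tree built above (hypothetical helper):
leaves are marked by split[0] == NULL, so the number of nodes reachable from
any root in td->var_root[] can be counted recursively.

static int example_count_var_nodes(const VAR_TREE *vt) {
  int i, n = 1;
  if (vt->split[0] == NULL) return 1;  // leaf
  for (i = 0; i < 4; ++i) n += example_count_var_nodes(vt->split[i]);
  return n;
}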
+
+void av1_free_var_tree(ThreadData *td) {
+ aom_free(td->var_tree);
+ td->var_tree = NULL;
+}
diff --git a/av1/encoder/variance_tree.h b/av1/encoder/variance_tree.h
new file mode 100644
index 0000000..6397084
--- /dev/null
+++ b/av1/encoder/variance_tree.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AV1_ENCODER_VARIANCE_TREE_H_
+#define AV1_ENCODER_VARIANCE_TREE_H_
+
+#include <assert.h>
+
+#include "./aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+struct ThreadData;
+
+typedef struct {
+ int64_t sum_square_error;
+ int64_t sum_error;
+ int log2_count;
+ int variance;
+} VAR;
+
+typedef struct {
+ VAR none;
+ VAR horz[2];
+ VAR vert[2];
+} partition_variance;
+
+typedef struct VAR_TREE {
+ int force_split;
+ partition_variance variances;
+ struct VAR_TREE *split[4];
+ BLOCK_SIZE bsize;
+ const uint8_t *src;
+ const uint8_t *ref;
+ int src_stride;
+ int ref_stride;
+ int width;
+ int height;
+#if CONFIG_AOM_HIGHBITDEPTH
+ int highbd;
+#endif // CONFIG_AOM_HIGHBITDEPTH
+} VAR_TREE;
+
+void av1_setup_var_tree(struct AV1Common *cm, struct ThreadData *td);
+void av1_free_var_tree(struct ThreadData *td);
+
+// Set variance values given sum square error, sum error, count.
+static INLINE void fill_variance(int64_t s2, int64_t s, int c, VAR *v) {
+ v->sum_square_error = s2;
+ v->sum_error = s;
+ v->log2_count = c;
+ v->variance =
+ (int)(256 * (v->sum_square_error -
+ ((v->sum_error * v->sum_error) >> v->log2_count)) >>
+ v->log2_count);
+}
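
A quick numeric check of the formula above (hypothetical values): for a block
of 16 samples (log2_count = 4) with sum_square_error = 1000 and sum_error = 40,
the stored value is (256 * (1000 - (40 * 40 >> 4))) >> 4 = 14400, i.e. 256
times the population variance of 56.25. As a sketch:

    VAR v;
    fill_variance(1000, 40, 4, &v);
    // v.variance == 14400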
+
+static INLINE void sum_2_variances(const VAR *a, const VAR *b, VAR *r) {
+ assert(a->log2_count == b->log2_count);
+ fill_variance(a->sum_square_error + b->sum_square_error,
+ a->sum_error + b->sum_error, a->log2_count + 1, r);
+}
+
+static INLINE void fill_variance_node(VAR_TREE *vt) {
+ sum_2_variances(&vt->split[0]->variances.none, &vt->split[1]->variances.none,
+ &vt->variances.horz[0]);
+ sum_2_variances(&vt->split[2]->variances.none, &vt->split[3]->variances.none,
+ &vt->variances.horz[1]);
+ sum_2_variances(&vt->split[0]->variances.none, &vt->split[2]->variances.none,
+ &vt->variances.vert[0]);
+ sum_2_variances(&vt->split[1]->variances.none, &vt->split[3]->variances.none,
+ &vt->variances.vert[1]);
+ sum_2_variances(&vt->variances.vert[0], &vt->variances.vert[1],
+ &vt->variances.none);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /* AV1_ENCODER_VARIANCE_TREE_H_ */
diff --git a/av1/encoder/wedge_utils.c b/av1/encoder/wedge_utils.c
new file mode 100644
index 0000000..596c5df
--- /dev/null
+++ b/av1/encoder/wedge_utils.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+
+#include "aom_ports/mem.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * Computes the SSE of a compound predictor constructed from two fundamental
+ * predictors p0 and p1, blended together using the mask m.
+ *
+ * r1: Residuals of p1.
+ * (source - p1)
+ * d: Difference of p1 and p0.
+ * (p1 - p0)
+ * m: The blending mask
+ * N: Number of pixels
+ *
+ * 'r1', 'd', and 'm' are contiguous.
+ *
+ * Computes:
+ * Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
+ * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
+ *   where r0 is (source - p0) and r1 is (source - p1), which in turn is
+ *   equivalent to:
+ * Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
+ * which is the SSE of the residuals of the compound predictor scaled up by
+ * MAX_MASK_VALUE**2.
+ *
+ * Note that the partial term in the loop is clamped to 16 bits signed. This
+ * is to facilitate an equivalent SIMD implementation. It has no effect as
+ * long as residuals fit within 16 - WEDGE_WEIGHT_BITS (= 10) signed bits,
+ * which always holds for 8-bit input; for real input it should hold
+ * practically always, as residuals are expected to be small.
+ */
+uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ uint64_t csse = 0;
+ int i;
+ assert(N % 64 == 0);
+ for (i = 0; i < N; i++) {
+ int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
+ t = clamp(t, INT16_MIN, INT16_MAX);
+ csse += t * t;
+ }
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
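
For reference, a direct (unoptimized) form of the same quantity, written in
terms of the derivation above with r0 = r1 + d (a sketch, not part of the
patch, and without the 16-bit clamp):

    static uint64_t wedge_sse_direct(const int16_t *r1, const int16_t *d,
                                     const uint8_t *m, int N) {
      uint64_t sse = 0;
      for (int i = 0; i < N; ++i) {
        // mask*r0 + (MAX_MASK_VALUE - mask)*r1, with r0 = r1 + d
        const int64_t e = (int64_t)m[i] * (r1[i] + d[i]) +
                          (int64_t)(MAX_MASK_VALUE - m[i]) * r1[i];
        sse += (uint64_t)(e * e);
      }
      return ROUND_POWER_OF_TWO(sse, 2 * WEDGE_WEIGHT_BITS);
    }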
+
+/**
+ * Choose the mask sign for a compound predictor.
+ *
+ * ds: Difference of the squares of the residuals.
+ * r0**2 - r1**2
+ * m: The blending mask
+ * N: Number of pixels
+ * limit: Pre-computed threshold value.
+ * MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * 'ds' and 'm' are contiguous.
+ *
+ * Returns true if the negated mask has lower SSE compared to the positive
+ * mask. Computation is based on:
+ * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2)
+ * >
+ * Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2)
+ *
+ * which can be simplified to:
+ *
+ * Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * The right-hand side does not depend on the mask and needs to be passed in
+ * as the 'limit' parameter.
+ *
+ * After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the
+ * left-hand side is simply a scalar product between an int16_t and a
+ * uint8_t vector.
+ *
+ * Note that for efficiency, ds is stored in 16 bits. Since real input
+ * residuals are small, this should not cause a noticeable issue.
+ */
+int av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, int N,
+ int64_t limit) {
+ int64_t acc = 0;
+
+ assert(N % 64 == 0);
+
+ do {
+ acc += *ds++ * *m++;
+ } while (--N);
+
+ return acc > limit;
+}
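
A sketch of how a hypothetical caller could pre-compute the 'limit' threshold
described above from the two residual arrays (the encoder's actual call site
is not shown in this patch):

    static int64_t wedge_sign_limit(const int16_t *r0, const int16_t *r1,
                                    int N) {
      int64_t sum0 = 0, sum1 = 0;
      for (int i = 0; i < N; ++i) {
        sum0 += (int64_t)r0[i] * r0[i];
        sum1 += (int64_t)r1[i] * r1[i];
      }
      // MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
      return (MAX_MASK_VALUE / 2) * (sum0 - sum1);
    }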
+
+/**
+ * Compute the element-wise difference of the squares of 2 arrays.
+ *
+ * d: Difference of the squares of the inputs: a**2 - b**2
+ * a: First input array
+ * b: Second input array
+ * N: Number of elements
+ *
+ * 'd', 'a', and 'b' are contiguous.
+ *
+ * The result is saturated to signed 16 bits.
+ */
+void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a,
+ const int16_t *b, int N) {
+ int i;
+
+ assert(N % 64 == 0);
+
+ for (i = 0; i < N; i++)
+ d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX);
+}
diff --git a/av1/encoder/x86/av1_highbd_quantize_sse4.c b/av1/encoder/x86/av1_highbd_quantize_sse4.c
new file mode 100644
index 0000000..dd3405f
--- /dev/null
+++ b/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>
+#include <stdint.h>
+
+#include "./av1_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+// Coefficient quantization phase 1
+// param[0-2] : rounding/quan/dequan constants
+static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
+ const int shift, const int scale,
+ __m128i *qcoeff, __m128i *dquan,
+ __m128i *sign) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+
+ *sign = _mm_cmplt_epi32(*coeff, zero);
+ *sign = _mm_or_si128(*sign, one);
+ *coeff = _mm_abs_epi32(*coeff);
+
+ qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
+ qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);
+ qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);
+
+ qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
+ qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
+ dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
+ dquan[0] = _mm_srli_epi64(dquan[0], scale);
+}
+
+// Coefficient quantization phase 2
+static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
+ const __m128i *sign,
+ const __m128i *param, const int shift,
+ const int scale, tran_low_t *qAddr,
+ tran_low_t *dqAddr) {
+ __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
+ __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);
+
+ qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
+ qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
+ dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
+ dquan[1] = _mm_srli_epi64(dquan[1], scale);
+
+ // combine L&H
+ qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
+ qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);
+
+ qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
+ qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
+
+ dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
+ dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
+
+ dquan[0] = _mm_and_si128(dquan[0], mask0H);
+ dquan[1] = _mm_and_si128(dquan[1], mask0L);
+
+ qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
+ dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
+
+ qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
+ dquan[0] = _mm_sign_epi32(dquan[0], *sign);
+
+ _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
+ _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
+}
+
+static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
+ __m128i *eob) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, iscanIdx;
+ const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
+ const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
+ __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero);
+ __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero);
+
+ nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero);
+ nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero);
+
+ mask = _mm_packs_epi32(nz_flag0, nz_flag1);
+ iscanIdx = _mm_loadu_si128((__m128i const *)iscan);
+ iscanIdx = _mm_sub_epi16(iscanIdx, mask);
+ iscanIdx = _mm_and_si128(iscanIdx, mask);
+ *eob = _mm_max_epi16(*eob, iscanIdx);
+}
+
+static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
+ __m128i eob_shuffled;
+ uint16_t eobValue;
+ eob_shuffled = _mm_shuffle_epi32(*eob, 0xe);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1);
+ *eob = _mm_max_epi16(*eob, eob_shuffled);
+ eobValue = _mm_extract_epi16(*eob, 0);
+ return eobValue;
+}
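
Scalar reference (sketch) for what find_eob() and get_accumulated_eob()
compute together: the end-of-block value is one past the largest scan
position that holds a non-zero quantized coefficient.

    static uint16_t eob_scalar(const tran_low_t *qcoeff, const int16_t *iscan,
                               int n) {
      uint16_t eob = 0;
      for (int i = 0; i < n; ++i)
        if (qcoeff[i]) eob = AOMMAX(eob, (uint16_t)(iscan[i] + 1));
      return eob;
    }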
+
+void av1_highbd_quantize_fp_sse4_1(
+ const tran_low_t *coeff_ptr, intptr_t count, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, int log_scale) {
+ __m128i coeff[2], qcoeff[2], dequant[2], qparam[3], coeff_sign;
+ __m128i eob = _mm_setzero_si128();
+ const tran_low_t *src = coeff_ptr;
+ tran_low_t *quanAddr = qcoeff_ptr;
+ tran_low_t *dquanAddr = dqcoeff_ptr;
+ const int shift = 16 - log_scale;
+ const int coeff_stride = 4;
+ const int quan_stride = coeff_stride;
+ (void)skip_block;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+
+ memset(quanAddr, 0, count * sizeof(quanAddr[0]));
+ memset(dquanAddr, 0, count * sizeof(dquanAddr[0]));
+
+ if (!skip_block) {
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+
+ qparam[0] =
+ _mm_set_epi32(round_ptr[1], round_ptr[1], round_ptr[1], round_ptr[0]);
+ qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[0]);
+ qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[0]);
+
+ // DC and first 3 AC
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+
+ // update round/quan/dquan for AC
+ qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
+ qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[1]);
+ qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[1]);
+
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr, dquanAddr);
+
+ // next 4 AC
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr + quan_stride,
+ dquanAddr + quan_stride);
+
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+
+ // loop for the rest of AC
+ while (count > 0) {
+ src += coeff_stride << 1;
+ quanAddr += quan_stride << 1;
+ dquanAddr += quan_stride << 1;
+ iscan += quan_stride << 1;
+
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff,
+ dequant, &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr, dquanAddr);
+
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff,
+ dequant, &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr + quan_stride,
+ dquanAddr + quan_stride);
+
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+ }
+ *eob_ptr = get_accumulated_eob(&eob);
+ } else {
+ *eob_ptr = 0;
+ }
+}
diff --git a/av1/encoder/x86/dct_intrin_sse2.c b/av1/encoder/x86/dct_intrin_sse2.c
index 1daa2d1..3596292 100644
--- a/av1/encoder/x86/dct_intrin_sse2.c
+++ b/av1/encoder/x86/dct_intrin_sse2.c
@@ -12,23 +12,38 @@
#include <assert.h>
#include <emmintrin.h> // SSE2
-#include "./av1_rtcd.h"
#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
#include "aom_dsp/txfm_common.h"
#include "aom_dsp/x86/fwd_txfm_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "aom_ports/mem.h"
static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
- int stride) {
+ int stride, int flipud, int fliplr) {
const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
__m128i mask;
- in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
- in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ if (!flipud) {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ } else {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+ in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+ }
in[0] = _mm_slli_epi16(in[0], 4);
in[1] = _mm_slli_epi16(in[1], 4);
@@ -152,6 +167,41 @@
transpose_4x4(in);
}
+#if CONFIG_EXT_TX
+static void fidtx4_sse2(__m128i *in) {
+ const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
+ const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i v0, v1, v2, v3;
+ __m128i u0, u1, u2, u3;
+
+ v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
+ v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
+ v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
+ v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
+
+ u0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
+ u1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
+ u2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
+ u3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
+
+ v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u0, u2);
+ in[1] = _mm_packs_epi32(u1, u3);
+ transpose_4x4(in);
+}
+#endif // CONFIG_EXT_TX
+
void av1_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
__m128i in[4];
@@ -159,24 +209,92 @@
switch (tx_type) {
case DCT_DCT: aom_fdct4x4_sse2(input, output, stride); break;
case ADST_DCT:
- load_buffer_4x4(input, in, stride);
+ load_buffer_4x4(input, in, stride, 0, 0);
fadst4_sse2(in);
fdct4_sse2(in);
write_buffer_4x4(output, in);
break;
case DCT_ADST:
- load_buffer_4x4(input, in, stride);
+ load_buffer_4x4(input, in, stride, 0, 0);
fdct4_sse2(in);
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
case ADST_ADST:
- load_buffer_4x4(input, in, stride);
+ load_buffer_4x4(input, in, stride, 0, 0);
fadst4_sse2(in);
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
- default: assert(0); break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_4x4(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fdct4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_4x4(input, in, stride, 0, 1);
+ fdct4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_4x4(input, in, stride, 1, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_4x4(input, in, stride, 0, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_4x4(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case V_DCT:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fidtx4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case H_DCT:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fdct4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case V_ADST:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fidtx4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case H_ADST:
+ load_buffer_4x4(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case V_FLIPADST:
+ load_buffer_4x4(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fidtx4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+ case H_FLIPADST:
+ load_buffer_4x4(input, in, stride, 0, 1);
+ fidtx4_sse2(in);
+ fadst4_sse2(in);
+ write_buffer_4x4(output, in);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
}
}
@@ -623,15 +741,37 @@
// load 8x8 array
static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
- int stride) {
- in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
- in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
- in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
- in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
- in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ int stride, int flipud, int fliplr) {
+ if (!flipud) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ } else {
+ in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ in[4] = mm_reverse_epi16(in[4]);
+ in[5] = mm_reverse_epi16(in[5]);
+ in[6] = mm_reverse_epi16(in[6]);
+ in[7] = mm_reverse_epi16(in[7]);
+ }
in[0] = _mm_slli_epi16(in[0], 2);
in[1] = _mm_slli_epi16(in[1], 2);
@@ -1131,6 +1271,21 @@
array_transpose_8x8(in, in);
}
+#if CONFIG_EXT_TX
+static void fidtx8_sse2(__m128i *in) {
+ in[0] = _mm_slli_epi16(in[0], 1);
+ in[1] = _mm_slli_epi16(in[1], 1);
+ in[2] = _mm_slli_epi16(in[2], 1);
+ in[3] = _mm_slli_epi16(in[3], 1);
+ in[4] = _mm_slli_epi16(in[4], 1);
+ in[5] = _mm_slli_epi16(in[5], 1);
+ in[6] = _mm_slli_epi16(in[6], 1);
+ in[7] = _mm_slli_epi16(in[7], 1);
+
+ array_transpose_8x8(in, in);
+}
+#endif // CONFIG_EXT_TX
+
void av1_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
__m128i in[8];
@@ -1138,40 +1293,149 @@
switch (tx_type) {
case DCT_DCT: aom_fdct8x8_sse2(input, output, stride); break;
case ADST_DCT:
- load_buffer_8x8(input, in, stride);
+ load_buffer_8x8(input, in, stride, 0, 0);
fadst8_sse2(in);
fdct8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case DCT_ADST:
- load_buffer_8x8(input, in, stride);
+ load_buffer_8x8(input, in, stride, 0, 0);
fdct8_sse2(in);
fadst8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case ADST_ADST:
- load_buffer_8x8(input, in, stride);
+ load_buffer_8x8(input, in, stride, 0, 0);
fadst8_sse2(in);
fadst8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
- default: assert(0); break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_8x8(input, in, stride, 1, 0);
+ fadst8_sse2(in);
+ fdct8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x8(input, in, stride, 0, 1);
+ fdct8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x8(input, in, stride, 1, 1);
+ fadst8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_8x8(input, in, stride, 0, 1);
+ fadst8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x8(input, in, stride, 1, 0);
+ fadst8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case V_DCT:
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fdct8_sse2(in);
+ fidtx8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case H_DCT:
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fidtx8_sse2(in);
+ fdct8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case V_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fadst8_sse2(in);
+ fidtx8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case H_ADST:
+ load_buffer_8x8(input, in, stride, 0, 0);
+ fidtx8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case V_FLIPADST:
+ load_buffer_8x8(input, in, stride, 1, 0);
+ fadst8_sse2(in);
+ fidtx8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+ case H_FLIPADST:
+ load_buffer_8x8(input, in, stride, 0, 1);
+ fidtx8_sse2(in);
+ fadst8_sse2(in);
+ right_shift_8x8(in, 1);
+ write_buffer_8x8(output, in, 8);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
}
}
static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0,
- __m128i *in1, int stride) {
- // load first 8 columns
- load_buffer_8x8(input, in0, stride);
- load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
+ __m128i *in1, int stride, int flipud,
+ int fliplr) {
+ // Load 4 8x8 blocks
+ const int16_t *topL = input;
+ const int16_t *topR = input + 8;
+ const int16_t *botL = input + 8 * stride;
+ const int16_t *botR = input + 8 * stride + 8;
- input += 8;
+ const int16_t *tmp;
+
+ if (flipud) {
+ // Swap left columns
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ // Swap right columns
+ tmp = topR;
+ topR = botR;
+ botR = tmp;
+ }
+
+ if (fliplr) {
+ // Swap top rows
+ tmp = topL;
+ topL = topR;
+ topR = tmp;
+ // Swap bottom rows
+ tmp = botL;
+ botL = botR;
+ botR = tmp;
+ }
+
+ // load first 8 columns
+ load_buffer_8x8(topL, in0, stride, flipud, fliplr);
+ load_buffer_8x8(botL, in0 + 8, stride, flipud, fliplr);
+
// load second 8 columns
- load_buffer_8x8(input, in1, stride);
- load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
+ load_buffer_8x8(topR, in1, stride, flipud, fliplr);
+ load_buffer_8x8(botR, in1 + 8, stride, flipud, fliplr);
}
static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
@@ -1217,7 +1481,7 @@
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
@@ -1421,10 +1685,10 @@
v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
- v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
- v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
@@ -1454,10 +1718,10 @@
// stage 5
s[0] = _mm_add_epi16(p[0], t[1]);
s[1] = _mm_sub_epi16(p[0], t[1]);
- s[2] = _mm_add_epi16(p[3], t[2]);
- s[3] = _mm_sub_epi16(p[3], t[2]);
- s[4] = _mm_sub_epi16(p[4], t[5]);
- s[5] = _mm_add_epi16(p[4], t[5]);
+ s[2] = _mm_sub_epi16(p[3], t[2]);
+ s[3] = _mm_add_epi16(p[3], t[2]);
+ s[4] = _mm_add_epi16(p[4], t[5]);
+ s[5] = _mm_sub_epi16(p[4], t[5]);
s[6] = _mm_sub_epi16(p[7], t[6]);
s[7] = _mm_add_epi16(p[7], t[6]);
@@ -2014,33 +2278,1504 @@
array_transpose_16x16(in0, in1);
}
+#if CONFIG_EXT_TX
+static void fidtx16_8col(__m128i *in) {
+ const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
+ const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i y0, y1, y2, y3, y4, y5, y6, y7;
+
+ in[0] = _mm_slli_epi16(in[0], 1);
+ in[1] = _mm_slli_epi16(in[1], 1);
+ in[2] = _mm_slli_epi16(in[2], 1);
+ in[3] = _mm_slli_epi16(in[3], 1);
+ in[4] = _mm_slli_epi16(in[4], 1);
+ in[5] = _mm_slli_epi16(in[5], 1);
+ in[6] = _mm_slli_epi16(in[6], 1);
+ in[7] = _mm_slli_epi16(in[7], 1);
+ in[8] = _mm_slli_epi16(in[8], 1);
+ in[9] = _mm_slli_epi16(in[9], 1);
+ in[10] = _mm_slli_epi16(in[10], 1);
+ in[11] = _mm_slli_epi16(in[11], 1);
+ in[12] = _mm_slli_epi16(in[12], 1);
+ in[13] = _mm_slli_epi16(in[13], 1);
+ in[14] = _mm_slli_epi16(in[14], 1);
+ in[15] = _mm_slli_epi16(in[15], 1);
+
+ v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
+ v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
+ v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
+ v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
+ v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
+ v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
+ v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
+ v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);
+
+ u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
+ u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
+ u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
+ u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
+ u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
+ u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
+ u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
+ u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);
+
+ x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
+ x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
+ x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
+ x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
+ x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
+ x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
+ x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
+ x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);
+
+ y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
+ y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
+ y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
+ y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
+ y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
+ y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
+ y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
+ y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);
+
+ v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
+ v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
+ v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
+ v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
+ v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
+ v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
+ v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
+ v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);
+
+ x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
+ x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
+ x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
+ x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
+ x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
+ x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
+ x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
+ x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);
+
+ u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
+ u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
+ u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
+ u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
+ u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
+ u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
+ u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
+ u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);
+
+ y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
+ y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
+ y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
+ y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
+ y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
+ y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
+ y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
+ y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);
+
+ v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
+ x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
+ x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
+ x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
+ x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
+ x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
+ x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
+ x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+ y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
+ y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
+ y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
+ y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
+ y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
+ y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
+ y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
+ y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
+ x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
+ x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
+ x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
+ x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
+ x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
+ x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
+ x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
+
+ u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
+ y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
+ y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
+ y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
+ y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
+ y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
+ y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
+ y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(v0, x0);
+ in[1] = _mm_packs_epi32(v1, x1);
+ in[2] = _mm_packs_epi32(v2, x2);
+ in[3] = _mm_packs_epi32(v3, x3);
+ in[4] = _mm_packs_epi32(v4, x4);
+ in[5] = _mm_packs_epi32(v5, x5);
+ in[6] = _mm_packs_epi32(v6, x6);
+ in[7] = _mm_packs_epi32(v7, x7);
+
+ in[8] = _mm_packs_epi32(u0, y0);
+ in[9] = _mm_packs_epi32(u1, y1);
+ in[10] = _mm_packs_epi32(u2, y2);
+ in[11] = _mm_packs_epi32(u3, y3);
+ in[12] = _mm_packs_epi32(u4, y4);
+ in[13] = _mm_packs_epi32(u5, y5);
+ in[14] = _mm_packs_epi32(u6, y6);
+ in[15] = _mm_packs_epi32(u7, y7);
+}
+
+static void fidtx16_sse2(__m128i *in0, __m128i *in1) {
+ fidtx16_8col(in0);
+ fidtx16_8col(in1);
+ array_transpose_16x16(in0, in1);
+}
+#endif // CONFIG_EXT_TX
+
void av1_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
__m128i in0[16], in1[16];
switch (tx_type) {
- case DCT_DCT: aom_fdct16x16_sse2(input, output, stride); break;
+ case DCT_DCT:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fdct16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fdct16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
case ADST_DCT:
- load_buffer_16x16(input, in0, in1, stride);
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fdct16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case DCT_ADST:
- load_buffer_16x16(input, in0, in1, stride);
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
fdct16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case ADST_ADST:
- load_buffer_16x16(input, in0, in1, stride);
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_16x16(input, in0, in1, stride, 1, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fdct16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 1);
+ fdct16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 1, 1);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 1);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16x16(input, in0, in1, stride, 1, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case V_DCT:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fdct16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fidtx16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case H_DCT:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fidtx16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fdct16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case V_ADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fidtx16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case H_ADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 0);
+ fidtx16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case V_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 1, 0);
+ fadst16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fidtx16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+ case H_FLIPADST:
+ load_buffer_16x16(input, in0, in1, stride, 0, 1);
+ fidtx16_sse2(in0, in1);
+ right_shift_16x16(in0, in1);
+ fadst16_sse2(in0, in1);
+ write_buffer_16x16(output, in0, in1, 16);
+ break;
+#endif // CONFIG_EXT_TX
default: assert(0); break;
}
}
+
+static INLINE void scale_sqrt2_8x4(__m128i *in) {
+ // Implements fdct_round_shift(input * Sqrt2), which is equivalent to
+ // ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS),
+ // for 32 consecutive elements.
+ const __m128i v_scale_w = _mm_set1_epi16(Sqrt2);
+
+ const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
+ const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
+ const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
+ const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
+ const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
+ const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
+ const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
+ const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
+
+ const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
+ const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
+ const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
+ const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
+ const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
+ const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
+
+ in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
+ in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
+ in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
+ in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
+}
+
+static INLINE void scale_sqrt2_8x8_signed(__m128i *in) {
+ // Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)'
+ // for each element
+ const __m128i v_scale_w = _mm_set1_epi16(Sqrt2);
+
+ const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
+ const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
+ const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
+ const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
+ const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
+ const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
+ const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
+ const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
+ const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w);
+ const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w);
+ const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w);
+ const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w);
+ const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w);
+ const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w);
+ const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w);
+ const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w);
+
+ const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
+ const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
+ const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
+ const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
+ const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
+ const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
+ const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
+ const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w);
+ const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w);
+ const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w);
+ const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w);
+ const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w);
+ const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w);
+ const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w);
+ const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w);
+
+ in[0] = _mm_packs_epi32(xx_roundn_epi32(v_p0a_d, DCT_CONST_BITS),
+ xx_roundn_epi32(v_p0b_d, DCT_CONST_BITS));
+ in[1] = _mm_packs_epi32(xx_roundn_epi32(v_p1a_d, DCT_CONST_BITS),
+ xx_roundn_epi32(v_p1b_d, DCT_CONST_BITS));
+ in[2] = _mm_packs_epi32(xx_roundn_epi32(v_p2a_d, DCT_CONST_BITS),
+ xx_roundn_epi32(v_p2b_d, DCT_CONST_BITS));
+ in[3] = _mm_packs_epi32(xx_roundn_epi32(v_p3a_d, DCT_CONST_BITS),
+ xx_roundn_epi32(v_p3b_d, DCT_CONST_BITS));
+ in[4] = _mm_packs_epi32(xx_roundn_epi32(v_p4a_d, DCT_CONST_BITS),
+ xx_roundn_epi32(v_p4b_d, DCT_CONST_BITS));
+ in[5] = _mm_packs_epi32(xx_roundn_epi32(v_p5a_d, DCT_CONST_BITS),
+ xx_roundn_epi32(v_p5b_d, DCT_CONST_BITS));
+ in[6] = _mm_packs_epi32(xx_roundn_epi32(v_p6a_d, DCT_CONST_BITS),
+ xx_roundn_epi32(v_p6b_d, DCT_CONST_BITS));
+ in[7] = _mm_packs_epi32(xx_roundn_epi32(v_p7a_d, DCT_CONST_BITS),
+ xx_roundn_epi32(v_p7b_d, DCT_CONST_BITS));
+}
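
Scalar reference (sketch) for a single element of the signed scaling above,
mirroring the comment: multiply by Sqrt2 and round-shift by DCT_CONST_BITS.

    static INLINE int16_t scale_sqrt2_scalar(int16_t x) {
      return (int16_t)ROUND_POWER_OF_TWO_SIGNED(x * Sqrt2, DCT_CONST_BITS);
    }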
+
+static INLINE void scale_sqrt2_8x16(__m128i *in) {
+ scale_sqrt2_8x4(in);
+ scale_sqrt2_8x4(in + 4);
+ scale_sqrt2_8x4(in + 8);
+ scale_sqrt2_8x4(in + 12);
+}
+
+// Load input into the left-hand half of in (i.e., into lanes 0..3 of
+// each element of in). The right-hand half (lanes 4..7) should be
+// treated as being filled with "don't care" values.
+static INLINE void load_buffer_4x8(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr) {
+ if (!flipud) {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride));
+ } else {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 7 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 6 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 5 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 4 * stride));
+ in[4] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ in[5] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[6] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[7] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+ in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+ in[4] = _mm_shufflelo_epi16(in[4], 0x1b);
+ in[5] = _mm_shufflelo_epi16(in[5], 0x1b);
+ in[6] = _mm_shufflelo_epi16(in[6], 0x1b);
+ in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
+ }
+
+ in[0] = _mm_slli_epi16(in[0], 3);
+ in[1] = _mm_slli_epi16(in[1], 3);
+ in[2] = _mm_slli_epi16(in[2], 3);
+ in[3] = _mm_slli_epi16(in[3], 3);
+ in[4] = _mm_slli_epi16(in[4], 3);
+ in[5] = _mm_slli_epi16(in[5], 3);
+ in[6] = _mm_slli_epi16(in[6], 3);
+ in[7] = _mm_slli_epi16(in[7], 3);
+
+ scale_sqrt2_8x4(in);
+ scale_sqrt2_8x4(in + 4);
+}
+
+static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) {
+ const __m128i kOne = _mm_set1_epi16(1);
+ __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
+ __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
+ __m128i in45 = _mm_unpacklo_epi64(res[4], res[5]);
+ __m128i in67 = _mm_unpacklo_epi64(res[6], res[7]);
+
+ __m128i out01 = _mm_add_epi16(in01, kOne);
+ __m128i out23 = _mm_add_epi16(in23, kOne);
+ __m128i out45 = _mm_add_epi16(in45, kOne);
+ __m128i out67 = _mm_add_epi16(in67, kOne);
+
+ out01 = _mm_srai_epi16(out01, 2);
+ out23 = _mm_srai_epi16(out23, 2);
+ out45 = _mm_srai_epi16(out45, 2);
+ out67 = _mm_srai_epi16(out67, 2);
+
+ store_output(&out01, (output + 0 * 8));
+ store_output(&out23, (output + 1 * 8));
+ store_output(&out45, (output + 2 * 8));
+ store_output(&out67, (output + 3 * 8));
+}
+
+void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in[8];
+
+ load_buffer_4x8(input, in, stride, 0, 0);
+ switch (tx_type) {
+ case DCT_DCT:
+ fdct8_sse2(in);
+      // Repack the data into two 4x4 blocks so we can reuse the 4x4
+      // transforms. The other cases (and the 8x4 transforms) behave
+      // similarly.
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ break;
+ case ADST_DCT:
+ fadst8_sse2(in);
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ break;
+ case DCT_ADST:
+ fdct8_sse2(in);
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ break;
+ case ADST_ADST:
+ fadst8_sse2(in);
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_4x8(input, in, stride, 1, 0);
+ fadst8_sse2(in);
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_4x8(input, in, stride, 0, 1);
+ fdct8_sse2(in);
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_4x8(input, in, stride, 1, 1);
+ fadst8_sse2(in);
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_4x8(input, in, stride, 0, 1);
+ fadst8_sse2(in);
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_4x8(input, in, stride, 1, 0);
+ fadst8_sse2(in);
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ break;
+ case IDTX:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fidtx8_sse2(in);
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ break;
+ case V_DCT:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fdct8_sse2(in);
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ break;
+ case H_DCT:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fidtx8_sse2(in);
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ break;
+ case V_ADST:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fadst8_sse2(in);
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ break;
+ case H_ADST:
+ load_buffer_4x8(input, in, stride, 0, 0);
+ fidtx8_sse2(in);
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ break;
+ case V_FLIPADST:
+ load_buffer_4x8(input, in, stride, 1, 0);
+ fadst8_sse2(in);
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ break;
+ case H_FLIPADST:
+ load_buffer_4x8(input, in, stride, 0, 1);
+ fidtx8_sse2(in);
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ write_buffer_4x8(output, in);
+}
+
+// Load input into the left-hand half of in (i.e., into lanes 0..3 of
+// each element of in). The right-hand half (lanes 4..7) should be
+// treated as being filled with "don't care" values.
+// The input is split horizontally into two 4x4 chunks 'l' and 'r'. Then
+// 'l' is stored in the top-left 4x4 block of 'in' and 'r' in the
+// bottom-left block. This allows us to reuse the 4x4 transforms.
+static INLINE void load_buffer_8x4(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr) {
+ if (!flipud) {
+ in[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
+ } else {
+ in[0] = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
+ in[1] = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
+ in[2] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
+ in[3] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ }
+
+ in[0] = _mm_slli_epi16(in[0], 3);
+ in[1] = _mm_slli_epi16(in[1], 3);
+ in[2] = _mm_slli_epi16(in[2], 3);
+ in[3] = _mm_slli_epi16(in[3], 3);
+
+ scale_sqrt2_8x4(in);
+
+ in[4] = _mm_shuffle_epi32(in[0], 0xe);
+ in[5] = _mm_shuffle_epi32(in[1], 0xe);
+ in[6] = _mm_shuffle_epi32(in[2], 0xe);
+ in[7] = _mm_shuffle_epi32(in[3], 0xe);
+}
+
+static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) {
+ const __m128i kOne = _mm_set1_epi16(1);
+
+ __m128i out0 = _mm_add_epi16(res[0], kOne);
+ __m128i out1 = _mm_add_epi16(res[1], kOne);
+ __m128i out2 = _mm_add_epi16(res[2], kOne);
+ __m128i out3 = _mm_add_epi16(res[3], kOne);
+ out0 = _mm_srai_epi16(out0, 2);
+ out1 = _mm_srai_epi16(out1, 2);
+ out2 = _mm_srai_epi16(out2, 2);
+ out3 = _mm_srai_epi16(out3, 2);
+
+ store_output(&out0, (output + 0 * 8));
+ store_output(&out1, (output + 1 * 8));
+ store_output(&out2, (output + 2 * 8));
+ store_output(&out3, (output + 3 * 8));
+}
+
+void av1_fht8x4_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in[8];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case ADST_DCT:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case DCT_ADST:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case ADST_ADST:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_8x4(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x4(input, in, stride, 0, 1);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x4(input, in, stride, 1, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_8x4(input, in, stride, 0, 1);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x4(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case IDTX:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case V_DCT:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fdct4_sse2(in);
+ fdct4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case H_DCT:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fdct8_sse2(in);
+ break;
+ case V_ADST:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case H_ADST:
+ load_buffer_8x4(input, in, stride, 0, 0);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+ case V_FLIPADST:
+ load_buffer_8x4(input, in, stride, 1, 0);
+ fadst4_sse2(in);
+ fadst4_sse2(in + 4);
+ fidtx8_sse2(in);
+ break;
+ case H_FLIPADST:
+ load_buffer_8x4(input, in, stride, 0, 1);
+ fidtx4_sse2(in);
+ fidtx4_sse2(in + 4);
+ fadst8_sse2(in);
+ break;
+#endif
+ default: assert(0); break;
+ }
+ write_buffer_8x4(output, in);
+}
+
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr) {
+ // Load 2 8x8 blocks
+ const int16_t *t = input;
+ const int16_t *b = input + 8 * stride;
+
+ if (flipud) {
+ const int16_t *const tmp = t;
+ t = b;
+ b = tmp;
+ }
+
+ load_buffer_8x8(t, in, stride, flipud, fliplr);
+ scale_sqrt2_8x8_signed(in);
+ load_buffer_8x8(b, in + 8, stride, flipud, fliplr);
+ scale_sqrt2_8x8_signed(in + 8);
+}
+
+void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in[16];
+
+ __m128i *const t = in; // Alias to top 8x8 sub block
+ __m128i *const b = in + 8; // Alias to bottom 8x8 sub block
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ fdct16_8col(in);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fdct8_sse2(t);
+ fdct8_sse2(b);
+ break;
+ case ADST_DCT:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ fadst16_8col(in);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fdct8_sse2(t);
+ fdct8_sse2(b);
+ break;
+ case DCT_ADST:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ fdct16_8col(in);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ break;
+ case ADST_ADST:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ fadst16_8col(in);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_8x16(input, in, stride, 1, 0);
+ fadst16_8col(in);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fdct8_sse2(t);
+ fdct8_sse2(b);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_8x16(input, in, stride, 0, 1);
+ fdct16_8col(in);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_8x16(input, in, stride, 1, 1);
+ fadst16_8col(in);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_8x16(input, in, stride, 0, 1);
+ fadst16_8col(in);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_8x16(input, in, stride, 1, 0);
+ fadst16_8col(in);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ break;
+ case IDTX:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ fidtx16_8col(in);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fidtx8_sse2(t);
+ fidtx8_sse2(b);
+ break;
+ case V_DCT:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ fdct16_8col(in);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fidtx8_sse2(t);
+ fidtx8_sse2(b);
+ break;
+ case H_DCT:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ fidtx16_8col(in);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fdct8_sse2(t);
+ fdct8_sse2(b);
+ break;
+ case V_ADST:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ fadst16_8col(in);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fidtx8_sse2(t);
+ fidtx8_sse2(b);
+ break;
+ case H_ADST:
+ load_buffer_8x16(input, in, stride, 0, 0);
+ fidtx16_8col(in);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ break;
+ case V_FLIPADST:
+ load_buffer_8x16(input, in, stride, 1, 0);
+ fadst16_8col(in);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fidtx8_sse2(t);
+ fidtx8_sse2(b);
+ break;
+ case H_FLIPADST:
+ load_buffer_8x16(input, in, stride, 0, 1);
+ fidtx16_8col(in);
+ array_transpose_8x8(t, t);
+ array_transpose_8x8(b, b);
+ fadst8_sse2(t);
+ fadst8_sse2(b);
+ break;
+#endif  // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ right_shift_8x8(t, 2);
+ right_shift_8x8(b, 2);
+ write_buffer_8x8(output, t, 8);
+ write_buffer_8x8(output + 64, b, 8);
+}
+
+static INLINE void load_buffer_16x8(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr) {
+ // Load 2 8x8 blocks
+ const int16_t *l = input;
+ const int16_t *r = input + 8;
+
+ if (fliplr) {
+ const int16_t *const tmp = l;
+ l = r;
+ r = tmp;
+ }
+
+  // Load the left 8 columns, then the right 8 columns
+ load_buffer_8x8(l, in, stride, flipud, fliplr);
+ scale_sqrt2_8x8_signed(in);
+ load_buffer_8x8(r, in + 8, stride, flipud, fliplr);
+ scale_sqrt2_8x8_signed(in + 8);
+}
+
+void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in[16];
+
+ __m128i *const l = in; // Alias to left 8x8 sub block
+ __m128i *const r = in + 8; // Alias to right 8x8 sub block, which we store
+ // in the second half of the array
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fdct8_sse2(l);
+ fdct8_sse2(r);
+ fdct16_8col(in);
+ break;
+ case ADST_DCT:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ fdct16_8col(in);
+ break;
+ case DCT_ADST:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fdct8_sse2(l);
+ fdct8_sse2(r);
+ fadst16_8col(in);
+ break;
+ case ADST_ADST:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ fadst16_8col(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_16x8(input, in, stride, 1, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ fdct16_8col(in);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16x8(input, in, stride, 0, 1);
+ fdct8_sse2(l);
+ fdct8_sse2(r);
+ fadst16_8col(in);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16x8(input, in, stride, 1, 1);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ fadst16_8col(in);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16x8(input, in, stride, 0, 1);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ fadst16_8col(in);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16x8(input, in, stride, 1, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ fadst16_8col(in);
+ break;
+ case IDTX:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fidtx8_sse2(l);
+ fidtx8_sse2(r);
+ fidtx16_8col(in);
+ break;
+ case V_DCT:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fdct8_sse2(l);
+ fdct8_sse2(r);
+ fidtx16_8col(in);
+ break;
+ case H_DCT:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fidtx8_sse2(l);
+ fidtx8_sse2(r);
+ fdct16_8col(in);
+ break;
+ case V_ADST:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ fidtx16_8col(in);
+ break;
+ case H_ADST:
+ load_buffer_16x8(input, in, stride, 0, 0);
+ fidtx8_sse2(l);
+ fidtx8_sse2(r);
+ fadst16_8col(in);
+ break;
+ case V_FLIPADST:
+ load_buffer_16x8(input, in, stride, 1, 0);
+ fadst8_sse2(l);
+ fadst8_sse2(r);
+ fidtx16_8col(in);
+ break;
+ case H_FLIPADST:
+ load_buffer_16x8(input, in, stride, 0, 1);
+ fidtx8_sse2(l);
+ fidtx8_sse2(r);
+ fadst16_8col(in);
+ break;
+#endif  // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ array_transpose_8x8(l, l);
+ array_transpose_8x8(r, r);
+ right_shift_8x8(l, 2);
+ right_shift_8x8(r, 2);
+ write_buffer_8x8(output, l, 16);
+ write_buffer_8x8(output + 8, r, 16);
+}
+
+// Note: The 16-column 32-element transforms expect their input to be
+// split up into a 2x2 grid of 8x16 blocks
+static INLINE void fdct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+ __m128i *br) {
+ fdct32_8col(tl, bl);
+ fdct32_8col(tr, br);
+ array_transpose_16x16(tl, tr);
+ array_transpose_16x16(bl, br);
+}
+
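+// "Half-right" 32-point transform, used here in place of a 32-point ADST:
+// the bottom half of the output is the top half of the input scaled by 4
+// (then transposed), while the top half of the output is a 16-point DCT of
+// the sqrt(2)-scaled bottom half of the input.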
+static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+ __m128i *br) {
+ __m128i tmpl[16], tmpr[16];
+ int i;
+
+ // Copy the bottom half of the input to temporary storage
+ for (i = 0; i < 16; ++i) {
+ tmpl[i] = bl[i];
+ tmpr[i] = br[i];
+ }
+
+ // Generate the bottom half of the output
+ for (i = 0; i < 16; ++i) {
+ bl[i] = _mm_slli_epi16(tl[i], 2);
+ br[i] = _mm_slli_epi16(tr[i], 2);
+ }
+ array_transpose_16x16(bl, br);
+
+ // Copy the temporary storage back to the top half of the input
+ for (i = 0; i < 16; ++i) {
+ tl[i] = tmpl[i];
+ tr[i] = tmpr[i];
+ }
+
+ // Generate the top half of the output
+ scale_sqrt2_8x16(tl);
+ scale_sqrt2_8x16(tr);
+ fdct16_sse2(tl, tr);
+}
+
+#if CONFIG_EXT_TX
+static INLINE void fidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
+ __m128i *br) {
+ int i;
+ for (i = 0; i < 16; ++i) {
+ tl[i] = _mm_slli_epi16(tl[i], 2);
+ tr[i] = _mm_slli_epi16(tr[i], 2);
+ bl[i] = _mm_slli_epi16(bl[i], 2);
+ br[i] = _mm_slli_epi16(br[i], 2);
+ }
+ array_transpose_16x16(tl, tr);
+ array_transpose_16x16(bl, br);
+}
+#endif
+
+static INLINE void load_buffer_16x32(const int16_t *input, __m128i *intl,
+ __m128i *intr, __m128i *inbl,
+ __m128i *inbr, int stride, int flipud,
+ int fliplr) {
+ int i;
+ if (flipud) {
+ input = input + 31 * stride;
+ stride = -stride;
+ }
+
+ for (i = 0; i < 16; ++i) {
+ intl[i + 0] = _mm_load_si128((const __m128i *)(input + i * stride + 0));
+ intr[i + 0] = _mm_load_si128((const __m128i *)(input + i * stride + 8));
+ inbl[i + 0] =
+ _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 0));
+ inbr[i + 0] =
+ _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 8));
+ }
+
+ if (fliplr) {
+ __m128i tmp;
+ for (i = 0; i < 16; ++i) {
+ tmp = intl[i];
+ intl[i] = mm_reverse_epi16(intr[i]);
+ intr[i] = mm_reverse_epi16(tmp);
+ tmp = inbl[i];
+ inbl[i] = mm_reverse_epi16(inbr[i]);
+ inbr[i] = mm_reverse_epi16(tmp);
+ }
+ }
+
+ scale_sqrt2_8x16(intl);
+ scale_sqrt2_8x16(intr);
+ scale_sqrt2_8x16(inbl);
+ scale_sqrt2_8x16(inbr);
+}
+
+static INLINE void right_shift_8x16(__m128i *res, const int bit) {
+ right_shift_8x8(res, bit);
+ right_shift_8x8(res + 8, bit);
+}
+
+static INLINE void write_buffer_16x32(tran_low_t *output, __m128i *restl,
+ __m128i *restr, __m128i *resbl,
+ __m128i *resbr) {
+ int i;
+ right_shift_8x16(restl, 2);
+ right_shift_8x16(restr, 2);
+ right_shift_8x16(resbl, 2);
+ right_shift_8x16(resbr, 2);
+ for (i = 0; i < 16; ++i) {
+ store_output(&restl[i], output + i * 16 + 0);
+ store_output(&restr[i], output + i * 16 + 8);
+ store_output(&resbl[i], output + (i + 16) * 16 + 0);
+ store_output(&resbr[i], output + (i + 16) * 16 + 8);
+ }
+}
+
+// Note on data layout, for both this and the 32x16 transforms:
+// So that we can reuse the 16-element transforms easily,
+// we want to split the input into 8x16 blocks.
+// For 16x32, this means the input is a 2x2 grid of such blocks.
+// For 32x16, it means the input is a 4x1 grid.
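+// For the 16x32 case below, the four 8x16 blocks are held in intl (top-left),
+// intr (top-right), inbl (bottom-left) and inbr (bottom-right), each as
+// 16 rows of 8 coefficients.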
+void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i intl[16], intr[16], inbl[16], inbr[16];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fdct32_16col(intl, intr, inbl, inbr);
+ fdct16_sse2(intl, intr);
+ fdct16_sse2(inbl, inbr);
+ break;
+ case ADST_DCT:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fhalfright32_16col(intl, intr, inbl, inbr);
+ fdct16_sse2(intl, intr);
+ fdct16_sse2(inbl, inbr);
+ break;
+ case DCT_ADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fdct32_16col(intl, intr, inbl, inbr);
+ fadst16_sse2(intl, intr);
+ fadst16_sse2(inbl, inbr);
+ break;
+ case ADST_ADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fhalfright32_16col(intl, intr, inbl, inbr);
+ fadst16_sse2(intl, intr);
+ fadst16_sse2(inbl, inbr);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
+ fhalfright32_16col(intl, intr, inbl, inbr);
+ fdct16_sse2(intl, intr);
+ fdct16_sse2(inbl, inbr);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
+ fdct32_16col(intl, intr, inbl, inbr);
+ fadst16_sse2(intl, intr);
+ fadst16_sse2(inbl, inbr);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 1);
+ fhalfright32_16col(intl, intr, inbl, inbr);
+ fadst16_sse2(intl, intr);
+ fadst16_sse2(inbl, inbr);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
+ fhalfright32_16col(intl, intr, inbl, inbr);
+ fadst16_sse2(intl, intr);
+ fadst16_sse2(inbl, inbr);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
+ fhalfright32_16col(intl, intr, inbl, inbr);
+ fadst16_sse2(intl, intr);
+ fadst16_sse2(inbl, inbr);
+ break;
+ case IDTX:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fidtx32_16col(intl, intr, inbl, inbr);
+ fidtx16_sse2(intl, intr);
+ fidtx16_sse2(inbl, inbr);
+ break;
+ case V_DCT:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fdct32_16col(intl, intr, inbl, inbr);
+ fidtx16_sse2(intl, intr);
+ fidtx16_sse2(inbl, inbr);
+ break;
+ case H_DCT:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fidtx32_16col(intl, intr, inbl, inbr);
+ fdct16_sse2(intl, intr);
+ fdct16_sse2(inbl, inbr);
+ break;
+ case V_ADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fhalfright32_16col(intl, intr, inbl, inbr);
+ fidtx16_sse2(intl, intr);
+ fidtx16_sse2(inbl, inbr);
+ break;
+ case H_ADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
+ fidtx32_16col(intl, intr, inbl, inbr);
+ fadst16_sse2(intl, intr);
+ fadst16_sse2(inbl, inbr);
+ break;
+ case V_FLIPADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
+ fhalfright32_16col(intl, intr, inbl, inbr);
+ fidtx16_sse2(intl, intr);
+ fidtx16_sse2(inbl, inbr);
+ break;
+ case H_FLIPADST:
+ load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
+ fidtx32_16col(intl, intr, inbl, inbr);
+ fadst16_sse2(intl, intr);
+ fadst16_sse2(inbl, inbr);
+ break;
+#endif  // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ write_buffer_16x32(output, intl, intr, inbl, inbr);
+}
+
+static INLINE void load_buffer_32x16(const int16_t *input, __m128i *in0,
+ __m128i *in1, __m128i *in2, __m128i *in3,
+ int stride, int flipud, int fliplr) {
+ int i;
+ if (flipud) {
+ input += 15 * stride;
+ stride = -stride;
+ }
+
+ for (i = 0; i < 16; ++i) {
+ in0[i] = _mm_load_si128((const __m128i *)(input + i * stride + 0));
+ in1[i] = _mm_load_si128((const __m128i *)(input + i * stride + 8));
+ in2[i] = _mm_load_si128((const __m128i *)(input + i * stride + 16));
+ in3[i] = _mm_load_si128((const __m128i *)(input + i * stride + 24));
+ }
+
+ if (fliplr) {
+ for (i = 0; i < 16; ++i) {
+ __m128i tmp1 = in0[i];
+ __m128i tmp2 = in1[i];
+ in0[i] = mm_reverse_epi16(in3[i]);
+ in1[i] = mm_reverse_epi16(in2[i]);
+ in2[i] = mm_reverse_epi16(tmp2);
+ in3[i] = mm_reverse_epi16(tmp1);
+ }
+ }
+
+ scale_sqrt2_8x16(in0);
+ scale_sqrt2_8x16(in1);
+ scale_sqrt2_8x16(in2);
+ scale_sqrt2_8x16(in3);
+}
+
+static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0,
+ __m128i *res1, __m128i *res2,
+ __m128i *res3) {
+ int i;
+ right_shift_8x16(res0, 2);
+ right_shift_8x16(res1, 2);
+ right_shift_8x16(res2, 2);
+ right_shift_8x16(res3, 2);
+ for (i = 0; i < 16; ++i) {
+ store_output(&res0[i], output + i * 32 + 0);
+ store_output(&res1[i], output + i * 32 + 8);
+ store_output(&res2[i], output + i * 32 + 16);
+ store_output(&res3[i], output + i * 32 + 24);
+ }
+}
+
+void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m128i in0[16], in1[16], in2[16], in3[16];
+
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
+ switch (tx_type) {
+ case DCT_DCT:
+ fdct16_sse2(in0, in1);
+ fdct16_sse2(in2, in3);
+ fdct32_16col(in0, in1, in2, in3);
+ break;
+ case ADST_DCT:
+ fadst16_sse2(in0, in1);
+ fadst16_sse2(in2, in3);
+ fdct32_16col(in0, in1, in2, in3);
+ break;
+ case DCT_ADST:
+ fdct16_sse2(in0, in1);
+ fdct16_sse2(in2, in3);
+ fhalfright32_16col(in0, in1, in2, in3);
+ break;
+ case ADST_ADST:
+ fadst16_sse2(in0, in1);
+ fadst16_sse2(in2, in3);
+ fhalfright32_16col(in0, in1, in2, in3);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
+ fadst16_sse2(in0, in1);
+ fadst16_sse2(in2, in3);
+ fdct32_16col(in0, in1, in2, in3);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
+ fdct16_sse2(in0, in1);
+ fdct16_sse2(in2, in3);
+ fhalfright32_16col(in0, in1, in2, in3);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 1);
+ fadst16_sse2(in0, in1);
+ fadst16_sse2(in2, in3);
+ fhalfright32_16col(in0, in1, in2, in3);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
+ fadst16_sse2(in0, in1);
+ fadst16_sse2(in2, in3);
+ fhalfright32_16col(in0, in1, in2, in3);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
+ fadst16_sse2(in0, in1);
+ fadst16_sse2(in2, in3);
+ fhalfright32_16col(in0, in1, in2, in3);
+ break;
+ case IDTX:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
+ fidtx16_sse2(in0, in1);
+ fidtx16_sse2(in2, in3);
+ fidtx32_16col(in0, in1, in2, in3);
+ break;
+ case V_DCT:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
+ fdct16_sse2(in0, in1);
+ fdct16_sse2(in2, in3);
+ fidtx32_16col(in0, in1, in2, in3);
+ break;
+ case H_DCT:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
+ fidtx16_sse2(in0, in1);
+ fidtx16_sse2(in2, in3);
+ fdct32_16col(in0, in1, in2, in3);
+ break;
+ case V_ADST:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
+ fadst16_sse2(in0, in1);
+ fadst16_sse2(in2, in3);
+ fidtx32_16col(in0, in1, in2, in3);
+ break;
+ case H_ADST:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
+ fidtx16_sse2(in0, in1);
+ fidtx16_sse2(in2, in3);
+ fhalfright32_16col(in0, in1, in2, in3);
+ break;
+ case V_FLIPADST:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
+ fadst16_sse2(in0, in1);
+ fadst16_sse2(in2, in3);
+ fidtx32_16col(in0, in1, in2, in3);
+ break;
+ case H_FLIPADST:
+ load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
+ fidtx16_sse2(in0, in1);
+ fidtx16_sse2(in2, in3);
+ fhalfright32_16col(in0, in1, in2, in3);
+ break;
+#endif  // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ write_buffer_32x16(output, in0, in1, in2, in3);
+}
diff --git a/av1/encoder/x86/dct_ssse3.c b/av1/encoder/x86/dct_ssse3.c
index 8cef227..717a99a 100644
--- a/av1/encoder/x86/dct_ssse3.c
+++ b/av1/encoder/x86/dct_ssse3.c
@@ -10,6 +10,11 @@
*/
#include <assert.h>
+#if defined(_MSC_VER) && _MSC_VER <= 1500
+// Need to include math.h before calling tmmintrin.h/intrin.h
+// in certain versions of MSVS.
+#include <math.h>
+#endif
#include <tmmintrin.h> // SSSE3
#include "./av1_rtcd.h"
diff --git a/av1/encoder/x86/error_intrin_avx2.c b/av1/encoder/x86/error_intrin_avx2.c
index 0d2e828..ae733a1 100644
--- a/av1/encoder/x86/error_intrin_avx2.c
+++ b/av1/encoder/x86/error_intrin_avx2.c
@@ -68,5 +68,6 @@
_mm_storel_epi64((__m128i *)(&sse), sse_reg128);
_mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
+ _mm256_zeroupper();
return sse;
}
diff --git a/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/av1/encoder/x86/highbd_fwd_txfm_sse4.c
new file mode 100644
index 0000000..d601208
--- /dev/null
+++ b/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -0,0 +1,1895 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "av1/common/av1_fwd_txfm2d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ if (!flipud) {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ } else {
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+ in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+ in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+ in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+ }
+
+ in[0] = _mm_cvtepi16_epi32(in[0]);
+ in[1] = _mm_cvtepi16_epi32(in[1]);
+ in[2] = _mm_cvtepi16_epi32(in[2]);
+ in[3] = _mm_cvtepi16_epi32(in[3]);
+
+ in[0] = _mm_slli_epi32(in[0], shift);
+ in[1] = _mm_slli_epi32(in[1], shift);
+ in[2] = _mm_slli_epi32(in[2], shift);
+ in[3] = _mm_slli_epi32(in[3], shift);
+}
+
+// Note: only the stage-2 cos bit is used.
+// shift[0] is applied in load_buffer_4x4();
+// shift[1] and shift[2] are the shifts applied by txfm_func_col() and
+// txfm_func_row() respectively (both are zero here).
+static void fdct4x4_sse4_1(__m128i *in, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i s0, s1, s2, s3;
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+
+ s0 = _mm_add_epi32(in[0], in[3]);
+ s1 = _mm_add_epi32(in[1], in[2]);
+ s2 = _mm_sub_epi32(in[1], in[2]);
+ s3 = _mm_sub_epi32(in[0], in[3]);
+
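+  // Each butterfly below multiplies by the cospi constants, adds the
+  // rounding constant 1 << (bit - 1) and arithmetic-shifts right by 'bit'.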
+ // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
+ u0 = _mm_mullo_epi32(s0, cospi32);
+ u1 = _mm_mullo_epi32(s1, cospi32);
+ u2 = _mm_add_epi32(u0, u1);
+ v0 = _mm_sub_epi32(u0, u1);
+
+ u3 = _mm_add_epi32(u2, rnding);
+ v1 = _mm_add_epi32(v0, rnding);
+
+ u0 = _mm_srai_epi32(u3, bit);
+ u2 = _mm_srai_epi32(v1, bit);
+
+ // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
+ v0 = _mm_mullo_epi32(s2, cospi48);
+ v1 = _mm_mullo_epi32(s3, cospi16);
+ v2 = _mm_add_epi32(v0, v1);
+
+ v3 = _mm_add_epi32(v2, rnding);
+ u1 = _mm_srai_epi32(v3, bit);
+
+ v0 = _mm_mullo_epi32(s2, cospi16);
+ v1 = _mm_mullo_epi32(s3, cospi48);
+ v2 = _mm_sub_epi32(v1, v0);
+
+ v3 = _mm_add_epi32(v2, rnding);
+ u3 = _mm_srai_epi32(v3, bit);
+
+  // Note: shift[1] and shift[2] are zero
+
+ // Transpose 4x4 32-bit
+ v0 = _mm_unpacklo_epi32(u0, u1);
+ v1 = _mm_unpackhi_epi32(u0, u1);
+ v2 = _mm_unpacklo_epi32(u2, u3);
+ v3 = _mm_unpackhi_epi32(u2, u3);
+
+ in[0] = _mm_unpacklo_epi64(v0, v2);
+ in[1] = _mm_unpackhi_epi64(v0, v2);
+ in[2] = _mm_unpacklo_epi64(v1, v3);
+ in[3] = _mm_unpackhi_epi64(v1, v3);
+}
+
+static INLINE void write_buffer_4x4(__m128i *res, tran_low_t *output) {
+ _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
+ _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
+ _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
+ _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
+}
+
+// Note:
+// The 4x4 forward transform is implemented by av1_fwd_txfm2d_4x4_sse4_1()
+// below; this stub is only kept because av1_highbd_fht4x4_c() has not been
+// removed yet.
+void av1_highbd_fht4x4_sse4_1(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ (void)input;
+ (void)output;
+ (void)stride;
+ (void)tx_type;
+ assert(0);
+}
+
+static void fadst4x4_sse4_1(__m128i *in, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ __m128i s0, s1, s2, s3;
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+
+ // stage 0
+ // stage 1
+ // stage 2
+ u0 = _mm_mullo_epi32(in[3], cospi8);
+ u1 = _mm_mullo_epi32(in[0], cospi56);
+ u2 = _mm_add_epi32(u0, u1);
+ s0 = _mm_add_epi32(u2, rnding);
+ s0 = _mm_srai_epi32(s0, bit);
+
+ v0 = _mm_mullo_epi32(in[3], cospi56);
+ v1 = _mm_mullo_epi32(in[0], cospi8);
+ v2 = _mm_sub_epi32(v0, v1);
+ s1 = _mm_add_epi32(v2, rnding);
+ s1 = _mm_srai_epi32(s1, bit);
+
+ u0 = _mm_mullo_epi32(in[1], cospi40);
+ u1 = _mm_mullo_epi32(in[2], cospi24);
+ u2 = _mm_add_epi32(u0, u1);
+ s2 = _mm_add_epi32(u2, rnding);
+ s2 = _mm_srai_epi32(s2, bit);
+
+ v0 = _mm_mullo_epi32(in[1], cospi24);
+ v1 = _mm_mullo_epi32(in[2], cospi40);
+ v2 = _mm_sub_epi32(v0, v1);
+ s3 = _mm_add_epi32(v2, rnding);
+ s3 = _mm_srai_epi32(s3, bit);
+
+ // stage 3
+ u0 = _mm_add_epi32(s0, s2);
+ u2 = _mm_sub_epi32(s0, s2);
+ u1 = _mm_add_epi32(s1, s3);
+ u3 = _mm_sub_epi32(s1, s3);
+
+ // stage 4
+ v0 = _mm_mullo_epi32(u2, cospi32);
+ v1 = _mm_mullo_epi32(u3, cospi32);
+ v2 = _mm_add_epi32(v0, v1);
+ s2 = _mm_add_epi32(v2, rnding);
+ u2 = _mm_srai_epi32(s2, bit);
+
+ v2 = _mm_sub_epi32(v0, v1);
+ s3 = _mm_add_epi32(v2, rnding);
+ u3 = _mm_srai_epi32(s3, bit);
+
+ // u0, u1, u2, u3
+ u2 = _mm_sub_epi32(kZero, u2);
+ u1 = _mm_sub_epi32(kZero, u1);
+
+ // u0, u2, u3, u1
+ // Transpose 4x4 32-bit
+ v0 = _mm_unpacklo_epi32(u0, u2);
+ v1 = _mm_unpackhi_epi32(u0, u2);
+ v2 = _mm_unpacklo_epi32(u3, u1);
+ v3 = _mm_unpackhi_epi32(u3, u1);
+
+ in[0] = _mm_unpacklo_epi64(v0, v2);
+ in[1] = _mm_unpackhi_epi64(v0, v2);
+ in[2] = _mm_unpacklo_epi64(v1, v3);
+ in[3] = _mm_unpackhi_epi64(v1, v3);
+}
+
+void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
+ int input_stride, int tx_type, int bd) {
+ __m128i in[4];
+ const TXFM_2D_CFG *cfg = NULL;
+
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg = &fwd_txfm_2d_cfg_dct_dct_4;
+ load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
+ fdct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fdct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case ADST_DCT:
+ cfg = &fwd_txfm_2d_cfg_adst_dct_4;
+ load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fdct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case DCT_ADST:
+ cfg = &fwd_txfm_2d_cfg_dct_adst_4;
+ load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
+ fdct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case ADST_ADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ cfg = &fwd_txfm_2d_cfg_adst_dct_4;
+ load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fdct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case DCT_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_dct_adst_4;
+ load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]);
+ fdct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case FLIPADST_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(input, in, input_stride, 1, 1, cfg->shift[0]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case ADST_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+ case FLIPADST_ADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+ load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+ fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+ write_buffer_4x4(in, coeff);
+ break;
+#endif  // CONFIG_EXT_TX
+ default: assert(0);
+ }
+ (void)bd;
+}
+
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ __m128i u;
+ if (!flipud) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ } else {
+ in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ in[4] = mm_reverse_epi16(in[4]);
+ in[5] = mm_reverse_epi16(in[5]);
+ in[6] = mm_reverse_epi16(in[6]);
+ in[7] = mm_reverse_epi16(in[7]);
+ }
+
+ u = _mm_unpackhi_epi64(in[4], in[4]);
+ in[8] = _mm_cvtepi16_epi32(in[4]);
+ in[9] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[5], in[5]);
+ in[10] = _mm_cvtepi16_epi32(in[5]);
+ in[11] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[6], in[6]);
+ in[12] = _mm_cvtepi16_epi32(in[6]);
+ in[13] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[7], in[7]);
+ in[14] = _mm_cvtepi16_epi32(in[7]);
+ in[15] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[3], in[3]);
+ in[6] = _mm_cvtepi16_epi32(in[3]);
+ in[7] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[2], in[2]);
+ in[4] = _mm_cvtepi16_epi32(in[2]);
+ in[5] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[1], in[1]);
+ in[2] = _mm_cvtepi16_epi32(in[1]);
+ in[3] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[0], in[0]);
+ in[0] = _mm_cvtepi16_epi32(in[0]);
+ in[1] = _mm_cvtepi16_epi32(u);
+
+ in[0] = _mm_slli_epi32(in[0], shift);
+ in[1] = _mm_slli_epi32(in[1], shift);
+ in[2] = _mm_slli_epi32(in[2], shift);
+ in[3] = _mm_slli_epi32(in[3], shift);
+ in[4] = _mm_slli_epi32(in[4], shift);
+ in[5] = _mm_slli_epi32(in[5], shift);
+ in[6] = _mm_slli_epi32(in[6], shift);
+ in[7] = _mm_slli_epi32(in[7], shift);
+
+ in[8] = _mm_slli_epi32(in[8], shift);
+ in[9] = _mm_slli_epi32(in[9], shift);
+ in[10] = _mm_slli_epi32(in[10], shift);
+ in[11] = _mm_slli_epi32(in[11], shift);
+ in[12] = _mm_slli_epi32(in[12], shift);
+ in[13] = _mm_slli_epi32(in[13], shift);
+ in[14] = _mm_slli_epi32(in[14], shift);
+ in[15] = _mm_slli_epi32(in[15], shift);
+}
+
+static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
+ const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+
+ in[0] = _mm_add_epi32(in[0], rounding);
+ in[1] = _mm_add_epi32(in[1], rounding);
+ in[2] = _mm_add_epi32(in[2], rounding);
+ in[3] = _mm_add_epi32(in[3], rounding);
+ in[4] = _mm_add_epi32(in[4], rounding);
+ in[5] = _mm_add_epi32(in[5], rounding);
+ in[6] = _mm_add_epi32(in[6], rounding);
+ in[7] = _mm_add_epi32(in[7], rounding);
+ in[8] = _mm_add_epi32(in[8], rounding);
+ in[9] = _mm_add_epi32(in[9], rounding);
+ in[10] = _mm_add_epi32(in[10], rounding);
+ in[11] = _mm_add_epi32(in[11], rounding);
+ in[12] = _mm_add_epi32(in[12], rounding);
+ in[13] = _mm_add_epi32(in[13], rounding);
+ in[14] = _mm_add_epi32(in[14], rounding);
+ in[15] = _mm_add_epi32(in[15], rounding);
+
+ in[0] = _mm_srai_epi32(in[0], shift);
+ in[1] = _mm_srai_epi32(in[1], shift);
+ in[2] = _mm_srai_epi32(in[2], shift);
+ in[3] = _mm_srai_epi32(in[3], shift);
+ in[4] = _mm_srai_epi32(in[4], shift);
+ in[5] = _mm_srai_epi32(in[5], shift);
+ in[6] = _mm_srai_epi32(in[6], shift);
+ in[7] = _mm_srai_epi32(in[7], shift);
+ in[8] = _mm_srai_epi32(in[8], shift);
+ in[9] = _mm_srai_epi32(in[9], shift);
+ in[10] = _mm_srai_epi32(in[10], shift);
+ in[11] = _mm_srai_epi32(in[11], shift);
+ in[12] = _mm_srai_epi32(in[12], shift);
+ in[13] = _mm_srai_epi32(in[13], shift);
+ in[14] = _mm_srai_epi32(in[14], shift);
+ in[15] = _mm_srai_epi32(in[15], shift);
+}
+
+static INLINE void write_buffer_8x8(const __m128i *res, tran_low_t *output) {
+ _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
+ _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
+ _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
+ _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
+
+ _mm_store_si128((__m128i *)(output + 4 * 4), res[4]);
+ _mm_store_si128((__m128i *)(output + 5 * 4), res[5]);
+ _mm_store_si128((__m128i *)(output + 6 * 4), res[6]);
+ _mm_store_si128((__m128i *)(output + 7 * 4), res[7]);
+
+ _mm_store_si128((__m128i *)(output + 8 * 4), res[8]);
+ _mm_store_si128((__m128i *)(output + 9 * 4), res[9]);
+ _mm_store_si128((__m128i *)(output + 10 * 4), res[10]);
+ _mm_store_si128((__m128i *)(output + 11 * 4), res[11]);
+
+ _mm_store_si128((__m128i *)(output + 12 * 4), res[12]);
+ _mm_store_si128((__m128i *)(output + 13 * 4), res[13]);
+ _mm_store_si128((__m128i *)(output + 14 * 4), res[14]);
+ _mm_store_si128((__m128i *)(output + 15 * 4), res[15]);
+}
+
+static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[8], v[8];
+
+ // Even 8 points 0, 2, ..., 14
+ // stage 0
+ // stage 1
+ u[0] = _mm_add_epi32(in[0], in[14]);
+ v[7] = _mm_sub_epi32(in[0], in[14]); // v[7]
+ u[1] = _mm_add_epi32(in[2], in[12]);
+ u[6] = _mm_sub_epi32(in[2], in[12]);
+ u[2] = _mm_add_epi32(in[4], in[10]);
+ u[5] = _mm_sub_epi32(in[4], in[10]);
+ u[3] = _mm_add_epi32(in[6], in[8]);
+ v[4] = _mm_sub_epi32(in[6], in[8]); // v[4]
+
+ // stage 2
+ v[0] = _mm_add_epi32(u[0], u[3]);
+ v[3] = _mm_sub_epi32(u[0], u[3]);
+ v[1] = _mm_add_epi32(u[1], u[2]);
+ v[2] = _mm_sub_epi32(u[1], u[2]);
+
+ v[5] = _mm_mullo_epi32(u[5], cospim32);
+ v[6] = _mm_mullo_epi32(u[6], cospi32);
+ v[5] = _mm_add_epi32(v[5], v[6]);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ u[0] = _mm_mullo_epi32(u[5], cospi32);
+ v[6] = _mm_mullo_epi32(u[6], cospim32);
+ v[6] = _mm_sub_epi32(u[0], v[6]);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ // stage 3
+ // type 0
+ v[0] = _mm_mullo_epi32(v[0], cospi32);
+ v[1] = _mm_mullo_epi32(v[1], cospi32);
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_sub_epi32(v[0], v[1]);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // type 1
+ v[0] = _mm_mullo_epi32(v[2], cospi48);
+ v[1] = _mm_mullo_epi32(v[3], cospi16);
+ u[2] = _mm_add_epi32(v[0], v[1]);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ v[0] = _mm_mullo_epi32(v[2], cospi16);
+ v[1] = _mm_mullo_epi32(v[3], cospi48);
+ u[3] = _mm_sub_epi32(v[1], v[0]);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ u[4] = _mm_add_epi32(v[4], v[5]);
+ u[5] = _mm_sub_epi32(v[4], v[5]);
+ u[6] = _mm_sub_epi32(v[7], v[6]);
+ u[7] = _mm_add_epi32(v[7], v[6]);
+
+ // stage 4
+ // stage 5
+ v[0] = _mm_mullo_epi32(u[4], cospi56);
+ v[1] = _mm_mullo_epi32(u[7], cospi8);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[2] = _mm_srai_epi32(v[0], bit); // buf0[4]
+
+ v[0] = _mm_mullo_epi32(u[4], cospi8);
+ v[1] = _mm_mullo_epi32(u[7], cospi56);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[14] = _mm_srai_epi32(v[0], bit); // buf0[7]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi24);
+ v[1] = _mm_mullo_epi32(u[6], cospi40);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[10] = _mm_srai_epi32(v[0], bit); // buf0[5]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi40);
+ v[1] = _mm_mullo_epi32(u[6], cospi24);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[6] = _mm_srai_epi32(v[0], bit); // buf0[6]
+
+ out[0] = u[0]; // buf0[0]
+ out[8] = u[1]; // buf0[1]
+ out[4] = u[2]; // buf0[2]
+ out[12] = u[3]; // buf0[3]
+
+ // Odd 8 points: 1, 3, ..., 15
+ // stage 0
+ // stage 1
+ u[0] = _mm_add_epi32(in[1], in[15]);
+ v[7] = _mm_sub_epi32(in[1], in[15]); // v[7]
+ u[1] = _mm_add_epi32(in[3], in[13]);
+ u[6] = _mm_sub_epi32(in[3], in[13]);
+ u[2] = _mm_add_epi32(in[5], in[11]);
+ u[5] = _mm_sub_epi32(in[5], in[11]);
+ u[3] = _mm_add_epi32(in[7], in[9]);
+ v[4] = _mm_sub_epi32(in[7], in[9]); // v[4]
+
+ // stage 2
+ v[0] = _mm_add_epi32(u[0], u[3]);
+ v[3] = _mm_sub_epi32(u[0], u[3]);
+ v[1] = _mm_add_epi32(u[1], u[2]);
+ v[2] = _mm_sub_epi32(u[1], u[2]);
+
+ v[5] = _mm_mullo_epi32(u[5], cospim32);
+ v[6] = _mm_mullo_epi32(u[6], cospi32);
+ v[5] = _mm_add_epi32(v[5], v[6]);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ u[0] = _mm_mullo_epi32(u[5], cospi32);
+ v[6] = _mm_mullo_epi32(u[6], cospim32);
+ v[6] = _mm_sub_epi32(u[0], v[6]);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ // stage 3
+ // type 0
+ v[0] = _mm_mullo_epi32(v[0], cospi32);
+ v[1] = _mm_mullo_epi32(v[1], cospi32);
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_sub_epi32(v[0], v[1]);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // type 1
+ v[0] = _mm_mullo_epi32(v[2], cospi48);
+ v[1] = _mm_mullo_epi32(v[3], cospi16);
+ u[2] = _mm_add_epi32(v[0], v[1]);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ v[0] = _mm_mullo_epi32(v[2], cospi16);
+ v[1] = _mm_mullo_epi32(v[3], cospi48);
+ u[3] = _mm_sub_epi32(v[1], v[0]);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ u[4] = _mm_add_epi32(v[4], v[5]);
+ u[5] = _mm_sub_epi32(v[4], v[5]);
+ u[6] = _mm_sub_epi32(v[7], v[6]);
+ u[7] = _mm_add_epi32(v[7], v[6]);
+
+ // stage 4
+ // stage 5
+ v[0] = _mm_mullo_epi32(u[4], cospi56);
+ v[1] = _mm_mullo_epi32(u[7], cospi8);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[3] = _mm_srai_epi32(v[0], bit); // buf0[4]
+
+ v[0] = _mm_mullo_epi32(u[4], cospi8);
+ v[1] = _mm_mullo_epi32(u[7], cospi56);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[15] = _mm_srai_epi32(v[0], bit); // buf0[7]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi24);
+ v[1] = _mm_mullo_epi32(u[6], cospi40);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[11] = _mm_srai_epi32(v[0], bit); // buf0[5]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi40);
+ v[1] = _mm_mullo_epi32(u[6], cospi24);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[7] = _mm_srai_epi32(v[0], bit); // buf0[6]
+
+ out[1] = u[0]; // buf0[0]
+ out[9] = u[1]; // buf0[1]
+ out[5] = u[2]; // buf0[2]
+ out[13] = u[3]; // buf0[3]
+}
+
+static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ __m128i u[8], v[8], x;
+
+ // Even 8 points: 0, 2, ..., 14
+ // stage 0
+ // stage 1
+ // stage 2
+ // (1)
+ u[0] = _mm_mullo_epi32(in[14], cospi4);
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[14], cospi60);
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[10], cospi20);
+ x = _mm_mullo_epi32(in[4], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[10], cospi44);
+ x = _mm_mullo_epi32(in[4], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[6], cospi36);
+ x = _mm_mullo_epi32(in[8], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[6], cospi28);
+ x = _mm_mullo_epi32(in[8], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[2], cospi52);
+ x = _mm_mullo_epi32(in[12], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[2], cospi12);
+ x = _mm_mullo_epi32(in[12], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3
+ v[0] = _mm_add_epi32(u[0], u[4]);
+ v[4] = _mm_sub_epi32(u[0], u[4]);
+ v[1] = _mm_add_epi32(u[1], u[5]);
+ v[5] = _mm_sub_epi32(u[1], u[5]);
+ v[2] = _mm_add_epi32(u[2], u[6]);
+ v[6] = _mm_sub_epi32(u[2], u[6]);
+ v[3] = _mm_add_epi32(u[3], u[7]);
+ v[7] = _mm_sub_epi32(u[3], u[7]);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ v[0] = _mm_add_epi32(u[0], u[2]);
+ v[2] = _mm_sub_epi32(u[0], u[2]);
+ v[1] = _mm_add_epi32(u[1], u[3]);
+ v[3] = _mm_sub_epi32(u[1], u[3]);
+ v[4] = _mm_add_epi32(u[4], u[6]);
+ v[6] = _mm_sub_epi32(u[4], u[6]);
+ v[5] = _mm_add_epi32(u[5], u[7]);
+ v[7] = _mm_sub_epi32(u[5], u[7]);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ out[0] = u[0];
+ out[2] = _mm_sub_epi32(kZero, u[4]);
+ out[4] = u[6];
+ out[6] = _mm_sub_epi32(kZero, u[2]);
+ out[8] = u[3];
+ out[10] = _mm_sub_epi32(kZero, u[7]);
+ out[12] = u[5];
+ out[14] = _mm_sub_epi32(kZero, u[1]);
+
+ // Odd 8 points: 1, 3, ..., 15
+ // stage 0
+ // stage 1
+ // stage 2
+ // (1)
+ u[0] = _mm_mullo_epi32(in[15], cospi4);
+ x = _mm_mullo_epi32(in[1], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[15], cospi60);
+ x = _mm_mullo_epi32(in[1], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2)
+ u[2] = _mm_mullo_epi32(in[11], cospi20);
+ x = _mm_mullo_epi32(in[5], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[11], cospi44);
+ x = _mm_mullo_epi32(in[5], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3)
+ u[4] = _mm_mullo_epi32(in[7], cospi36);
+ x = _mm_mullo_epi32(in[9], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[7], cospi28);
+ x = _mm_mullo_epi32(in[9], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4)
+ u[6] = _mm_mullo_epi32(in[3], cospi52);
+ x = _mm_mullo_epi32(in[13], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[3], cospi12);
+ x = _mm_mullo_epi32(in[13], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3
+ v[0] = _mm_add_epi32(u[0], u[4]);
+ v[4] = _mm_sub_epi32(u[0], u[4]);
+ v[1] = _mm_add_epi32(u[1], u[5]);
+ v[5] = _mm_sub_epi32(u[1], u[5]);
+ v[2] = _mm_add_epi32(u[2], u[6]);
+ v[6] = _mm_sub_epi32(u[2], u[6]);
+ v[3] = _mm_add_epi32(u[3], u[7]);
+ v[7] = _mm_sub_epi32(u[3], u[7]);
+
+ // stage 4
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5
+ v[0] = _mm_add_epi32(u[0], u[2]);
+ v[2] = _mm_sub_epi32(u[0], u[2]);
+ v[1] = _mm_add_epi32(u[1], u[3]);
+ v[3] = _mm_sub_epi32(u[1], u[3]);
+ v[4] = _mm_add_epi32(u[4], u[6]);
+ v[6] = _mm_sub_epi32(u[4], u[6]);
+ v[5] = _mm_add_epi32(u[5], u[7]);
+ v[7] = _mm_sub_epi32(u[5], u[7]);
+
+ // stage 6
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7
+ out[1] = u[0];
+ out[3] = _mm_sub_epi32(kZero, u[4]);
+ out[5] = u[6];
+ out[7] = _mm_sub_epi32(kZero, u[2]);
+ out[9] = u[3];
+ out[11] = _mm_sub_epi32(kZero, u[7]);
+ out[13] = u[5];
+ out[15] = _mm_sub_epi32(kZero, u[1]);
+}
+
+void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
+ int tx_type, int bd) {
+ __m128i in[16], out[16];
+ const TXFM_2D_CFG *cfg = NULL;
+
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg = &fwd_txfm_2d_cfg_dct_dct_8;
+ load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]);
+ fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case ADST_DCT:
+ cfg = &fwd_txfm_2d_cfg_adst_dct_8;
+ load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case DCT_ADST:
+ cfg = &fwd_txfm_2d_cfg_dct_adst_8;
+ load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]);
+ fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case ADST_ADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ cfg = &fwd_txfm_2d_cfg_adst_dct_8;
+ load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case DCT_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_dct_adst_8;
+ load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]);
+ fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case FLIPADST_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(input, in, stride, 1, 1, cfg->shift[0]);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case ADST_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+ case FLIPADST_ADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+ load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+ col_txfm_8x8_rounding(out, -cfg->shift[1]);
+ transpose_8x8(out, in);
+ fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+ transpose_8x8(out, in);
+ write_buffer_8x8(in, coeff);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
+ }
+ (void)bd;
+}
+
+// Hybrid Transform 16x16
+
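+// The source buffer holds four 8x8 blocks of 32-bit coefficients
+// (top-left at in[0..15], top-right at in[16..31], bottom-left at in[32..47],
+// bottom-right at in[48..63]); interleave them into 16 rows of four __m128i
+// registers each.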
+static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) {
+ int row_index = 0;
+ int dst_index = 0;
+ int src_index = 0;
+
+ // row 0, 1, .., 7
+ do {
+ out[dst_index] = in[src_index];
+ out[dst_index + 1] = in[src_index + 1];
+ out[dst_index + 2] = in[src_index + 16];
+ out[dst_index + 3] = in[src_index + 17];
+ dst_index += 4;
+ src_index += 2;
+ row_index += 1;
+ } while (row_index < 8);
+
+ // row 8, 9, ..., 15
+ src_index += 16;
+ do {
+ out[dst_index] = in[src_index];
+ out[dst_index + 1] = in[src_index + 1];
+ out[dst_index + 2] = in[src_index + 16];
+ out[dst_index + 3] = in[src_index + 17];
+ dst_index += 4;
+ src_index += 2;
+ row_index += 1;
+ } while (row_index < 16);
+}
+
+static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ __m128i in[64];
+ // Load 4 8x8 blocks
+ const int16_t *topL = input;
+ const int16_t *topR = input + 8;
+ const int16_t *botL = input + 8 * stride;
+ const int16_t *botR = input + 8 * stride + 8;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ // Swap left columns
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ // Swap right columns
+ tmp = topR;
+ topR = botR;
+ botR = tmp;
+ }
+
+ if (fliplr) {
+ // Swap top rows
+ tmp = topL;
+ topL = topR;
+ topR = tmp;
+ // Swap bottom rows
+ tmp = botL;
+ botL = botR;
+ botR = tmp;
+ }
+
+ // load first 8 columns
+ load_buffer_8x8(topL, &in[0], stride, flipud, fliplr, shift);
+ load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift);
+
+ // load second 8 columns
+ load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift);
+ load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift);
+
+ convert_8x8_to_16x16(in, out);
+}
+
+static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[16], v[16], x;
+ const int col_num = 4;
+ int col;
+
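+  // Each __m128i holds 4 of the 16 columns, so the transform is applied to
+  // col_num = 4 groups of columns in turn.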
+ // Calculate the column 0, 1, 2, 3
+ for (col = 0; col < col_num; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+ u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+
+ // stage 2
+ v[0] = _mm_add_epi32(u[0], u[7]);
+ v[7] = _mm_sub_epi32(u[0], u[7]);
+ v[1] = _mm_add_epi32(u[1], u[6]);
+ v[6] = _mm_sub_epi32(u[1], u[6]);
+ v[2] = _mm_add_epi32(u[2], u[5]);
+ v[5] = _mm_sub_epi32(u[2], u[5]);
+ v[3] = _mm_add_epi32(u[3], u[4]);
+ v[4] = _mm_sub_epi32(u[3], u[4]);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ v[10] = _mm_mullo_epi32(u[10], cospim32);
+ x = _mm_mullo_epi32(u[13], cospi32);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[13], cospim32);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = _mm_mullo_epi32(u[11], cospim32);
+ x = _mm_mullo_epi32(u[12], cospi32);
+ v[11] = _mm_add_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[11], cospi32);
+ x = _mm_mullo_epi32(u[12], cospim32);
+ v[12] = _mm_sub_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 3
+ u[0] = _mm_add_epi32(v[0], v[3]);
+ u[3] = _mm_sub_epi32(v[0], v[3]);
+ u[1] = _mm_add_epi32(v[1], v[2]);
+ u[2] = _mm_sub_epi32(v[1], v[2]);
+ u[4] = v[4];
+
+ u[5] = _mm_mullo_epi32(v[5], cospim32);
+ x = _mm_mullo_epi32(v[6], cospi32);
+ u[5] = _mm_add_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[5], cospi32);
+ x = _mm_mullo_epi32(v[6], cospim32);
+ u[6] = _mm_sub_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ u[8] = _mm_add_epi32(v[8], v[11]);
+ u[11] = _mm_sub_epi32(v[8], v[11]);
+ u[9] = _mm_add_epi32(v[9], v[10]);
+ u[10] = _mm_sub_epi32(v[9], v[10]);
+ u[12] = _mm_sub_epi32(v[15], v[12]);
+ u[15] = _mm_add_epi32(v[15], v[12]);
+ u[13] = _mm_sub_epi32(v[14], v[13]);
+ u[14] = _mm_add_epi32(v[14], v[13]);
+
+ // stage 4
+ u[0] = _mm_mullo_epi32(u[0], cospi32);
+ u[1] = _mm_mullo_epi32(u[1], cospi32);
+ v[0] = _mm_add_epi32(u[0], u[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_sub_epi32(u[0], u[1]);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = _mm_mullo_epi32(u[2], cospi48);
+ x = _mm_mullo_epi32(u[3], cospi16);
+ v[2] = _mm_add_epi32(v[2], x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_mullo_epi32(u[2], cospi16);
+ x = _mm_mullo_epi32(u[3], cospi48);
+ v[3] = _mm_sub_epi32(x, v[3]);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = _mm_add_epi32(u[4], u[5]);
+ v[5] = _mm_sub_epi32(u[4], u[5]);
+ v[6] = _mm_sub_epi32(u[7], u[6]);
+ v[7] = _mm_add_epi32(u[7], u[6]);
+ v[8] = u[8];
+
+ v[9] = _mm_mullo_epi32(u[9], cospim16);
+ x = _mm_mullo_epi32(u[14], cospi48);
+ v[9] = _mm_add_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[14] = _mm_mullo_epi32(u[9], cospi48);
+ x = _mm_mullo_epi32(u[14], cospim16);
+ v[14] = _mm_sub_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospim48);
+ x = _mm_mullo_epi32(u[13], cospim16);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospim16);
+ x = _mm_mullo_epi32(u[13], cospim48);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = u[11];
+ v[12] = u[12];
+ v[15] = u[15];
+
+ // stage 5
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi56);
+ x = _mm_mullo_epi32(v[7], cospi8);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[7] = _mm_mullo_epi32(v[4], cospi8);
+ x = _mm_mullo_epi32(v[7], cospi56);
+ u[7] = _mm_sub_epi32(x, u[7]);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ u[5] = _mm_mullo_epi32(v[5], cospi24);
+ x = _mm_mullo_epi32(v[6], cospi40);
+ u[5] = _mm_add_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[5], cospi40);
+ x = _mm_mullo_epi32(v[6], cospi24);
+ u[6] = _mm_sub_epi32(x, u[6]);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[8] = _mm_add_epi32(v[8], v[9]);
+ u[9] = _mm_sub_epi32(v[8], v[9]);
+ u[10] = _mm_sub_epi32(v[11], v[10]);
+ u[11] = _mm_add_epi32(v[11], v[10]);
+ u[12] = _mm_add_epi32(v[12], v[13]);
+ u[13] = _mm_sub_epi32(v[12], v[13]);
+ u[14] = _mm_sub_epi32(v[15], v[14]);
+ u[15] = _mm_add_epi32(v[15], v[14]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm_mullo_epi32(u[8], cospi60);
+ x = _mm_mullo_epi32(u[15], cospi4);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[15] = _mm_mullo_epi32(u[8], cospi4);
+ x = _mm_mullo_epi32(u[15], cospi60);
+ v[15] = _mm_sub_epi32(x, v[15]);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ v[9] = _mm_mullo_epi32(u[9], cospi28);
+ x = _mm_mullo_epi32(u[14], cospi36);
+ v[9] = _mm_add_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[14] = _mm_mullo_epi32(u[9], cospi36);
+ x = _mm_mullo_epi32(u[14], cospi28);
+ v[14] = _mm_sub_epi32(x, v[14]);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospi44);
+ x = _mm_mullo_epi32(u[13], cospi20);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospi20);
+ x = _mm_mullo_epi32(u[13], cospi44);
+ v[13] = _mm_sub_epi32(x, v[13]);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = _mm_mullo_epi32(u[11], cospi12);
+ x = _mm_mullo_epi32(u[12], cospi52);
+ v[11] = _mm_add_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[11], cospi52);
+ x = _mm_mullo_epi32(u[12], cospi12);
+ v[12] = _mm_sub_epi32(x, v[12]);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ out[0 * col_num + col] = v[0];
+ out[1 * col_num + col] = v[8];
+ out[2 * col_num + col] = v[4];
+ out[3 * col_num + col] = v[12];
+ out[4 * col_num + col] = v[2];
+ out[5 * col_num + col] = v[10];
+ out[6 * col_num + col] = v[6];
+ out[7 * col_num + col] = v[14];
+ out[8 * col_num + col] = v[1];
+ out[9 * col_num + col] = v[9];
+ out[10 * col_num + col] = v[5];
+ out[11 * col_num + col] = v[13];
+ out[12 * col_num + col] = v[3];
+ out[13 * col_num + col] = v[11];
+ out[14 * col_num + col] = v[7];
+ out[15 * col_num + col] = v[15];
+ }
+}
+
+static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+ const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[16], v[16], x, y;
+ const int col_num = 4;
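+  // Layout: the 16x16 int32 block occupies 64 __m128i registers; the 4-lane
+  // group for (row, col) is in[row * col_num + col].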
+ int col;
+
+  // Process columns 0, 1, 2 and 3
+ for (col = 0; col < col_num; ++col) {
+ // stage 0
+ // stage 1
+ // stage 2
+ v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2);
+ x = _mm_mullo_epi32(in[0 * col_num + col], cospi62);
+ v[0] = _mm_add_epi32(v[0], x);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62);
+ x = _mm_mullo_epi32(in[0 * col_num + col], cospi2);
+ v[1] = _mm_sub_epi32(v[1], x);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10);
+ x = _mm_mullo_epi32(in[2 * col_num + col], cospi54);
+ v[2] = _mm_add_epi32(v[2], x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54);
+ x = _mm_mullo_epi32(in[2 * col_num + col], cospi10);
+ v[3] = _mm_sub_epi32(v[3], x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18);
+ x = _mm_mullo_epi32(in[4 * col_num + col], cospi46);
+ v[4] = _mm_add_epi32(v[4], x);
+ v[4] = _mm_add_epi32(v[4], rnding);
+ v[4] = _mm_srai_epi32(v[4], bit);
+
+ v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46);
+ x = _mm_mullo_epi32(in[4 * col_num + col], cospi18);
+ v[5] = _mm_sub_epi32(v[5], x);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26);
+ x = _mm_mullo_epi32(in[6 * col_num + col], cospi38);
+ v[6] = _mm_add_epi32(v[6], x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38);
+ x = _mm_mullo_epi32(in[6 * col_num + col], cospi26);
+ v[7] = _mm_sub_epi32(v[7], x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34);
+ x = _mm_mullo_epi32(in[8 * col_num + col], cospi30);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30);
+ x = _mm_mullo_epi32(in[8 * col_num + col], cospi34);
+ v[9] = _mm_sub_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42);
+ x = _mm_mullo_epi32(in[10 * col_num + col], cospi22);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22);
+ x = _mm_mullo_epi32(in[10 * col_num + col], cospi42);
+ v[11] = _mm_sub_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50);
+ x = _mm_mullo_epi32(in[12 * col_num + col], cospi14);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14);
+ x = _mm_mullo_epi32(in[12 * col_num + col], cospi50);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58);
+ x = _mm_mullo_epi32(in[14 * col_num + col], cospi6);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6);
+ x = _mm_mullo_epi32(in[14 * col_num + col], cospi58);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 3
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm_mullo_epi32(u[8], cospi8);
+ x = _mm_mullo_epi32(u[9], cospi56);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[9] = _mm_mullo_epi32(u[8], cospi56);
+ x = _mm_mullo_epi32(u[9], cospi8);
+ v[9] = _mm_sub_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospi40);
+ x = _mm_mullo_epi32(u[11], cospi24);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_mullo_epi32(u[10], cospi24);
+ x = _mm_mullo_epi32(u[11], cospi40);
+ v[11] = _mm_sub_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[12], cospim56);
+ x = _mm_mullo_epi32(u[13], cospi8);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(u[12], cospi8);
+ x = _mm_mullo_epi32(u[13], cospim56);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(u[14], cospim24);
+ x = _mm_mullo_epi32(u[15], cospi40);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(u[14], cospi40);
+ x = _mm_mullo_epi32(u[15], cospim24);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 5
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+
+ v[4] = _mm_mullo_epi32(u[4], cospi16);
+ x = _mm_mullo_epi32(u[5], cospi48);
+ v[4] = _mm_add_epi32(v[4], x);
+ v[4] = _mm_add_epi32(v[4], rnding);
+ v[4] = _mm_srai_epi32(v[4], bit);
+
+ v[5] = _mm_mullo_epi32(u[4], cospi48);
+ x = _mm_mullo_epi32(u[5], cospi16);
+ v[5] = _mm_sub_epi32(v[5], x);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ v[6] = _mm_mullo_epi32(u[6], cospim48);
+ x = _mm_mullo_epi32(u[7], cospi16);
+ v[6] = _mm_add_epi32(v[6], x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_mullo_epi32(u[6], cospi16);
+ x = _mm_mullo_epi32(u[7], cospim48);
+ v[7] = _mm_sub_epi32(v[7], x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+
+ v[12] = _mm_mullo_epi32(u[12], cospi16);
+ x = _mm_mullo_epi32(u[13], cospi48);
+ v[12] = _mm_add_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ v[13] = _mm_mullo_epi32(u[12], cospi48);
+ x = _mm_mullo_epi32(u[13], cospi16);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[14] = _mm_mullo_epi32(u[14], cospim48);
+ x = _mm_mullo_epi32(u[15], cospi16);
+ v[14] = _mm_add_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_mullo_epi32(u[14], cospi16);
+ x = _mm_mullo_epi32(u[15], cospim48);
+ v[15] = _mm_sub_epi32(v[15], x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 7
+ u[0] = _mm_add_epi32(v[0], v[2]);
+ u[2] = _mm_sub_epi32(v[0], v[2]);
+ u[1] = _mm_add_epi32(v[1], v[3]);
+ u[3] = _mm_sub_epi32(v[1], v[3]);
+ u[4] = _mm_add_epi32(v[4], v[6]);
+ u[6] = _mm_sub_epi32(v[4], v[6]);
+ u[5] = _mm_add_epi32(v[5], v[7]);
+ u[7] = _mm_sub_epi32(v[5], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[10]);
+ u[10] = _mm_sub_epi32(v[8], v[10]);
+ u[9] = _mm_add_epi32(v[9], v[11]);
+ u[11] = _mm_sub_epi32(v[9], v[11]);
+ u[12] = _mm_add_epi32(v[12], v[14]);
+ u[14] = _mm_sub_epi32(v[12], v[14]);
+ u[13] = _mm_add_epi32(v[13], v[15]);
+ u[15] = _mm_sub_epi32(v[13], v[15]);
+
+ // stage 8
+ v[0] = u[0];
+ v[1] = u[1];
+
+ y = _mm_mullo_epi32(u[2], cospi32);
+ x = _mm_mullo_epi32(u[3], cospi32);
+ v[2] = _mm_add_epi32(y, x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(y, x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ y = _mm_mullo_epi32(u[6], cospi32);
+ x = _mm_mullo_epi32(u[7], cospi32);
+ v[6] = _mm_add_epi32(y, x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(y, x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ y = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[11], cospi32);
+ v[10] = _mm_add_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ y = _mm_mullo_epi32(u[14], cospi32);
+ x = _mm_mullo_epi32(u[15], cospi32);
+ v[14] = _mm_add_epi32(y, x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(y, x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 9
+ out[0 * col_num + col] = v[0];
+ out[1 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[8]);
+ out[2 * col_num + col] = v[12];
+ out[3 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[4]);
+ out[4 * col_num + col] = v[6];
+ out[5 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[14]);
+ out[6 * col_num + col] = v[10];
+ out[7 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[2]);
+ out[8 * col_num + col] = v[3];
+ out[9 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[11]);
+ out[10 * col_num + col] = v[15];
+ out[11 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[7]);
+ out[12 * col_num + col] = v[5];
+ out[13 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[13]);
+ out[14 * col_num + col] = v[9];
+ out[15 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[1]);
+ }
+}
+
+static void col_txfm_16x16_rounding(__m128i *in, int shift) {
+  // Note:
+  // The 16x16 rounding is done as four 8x8 rounding passes over quarters of
+  // the coefficient buffer, rather than as four column passes.
+ col_txfm_8x8_rounding(&in[0], shift);
+ col_txfm_8x8_rounding(&in[16], shift);
+ col_txfm_8x8_rounding(&in[32], shift);
+ col_txfm_8x8_rounding(&in[48], shift);
+}
+
+static void write_buffer_16x16(const __m128i *in, tran_low_t *output) {
+ const int size_8x8 = 16 * 4;
+ write_buffer_8x8(&in[0], output);
+ output += size_8x8;
+ write_buffer_8x8(&in[16], output);
+ output += size_8x8;
+ write_buffer_8x8(&in[32], output);
+ output += size_8x8;
+ write_buffer_8x8(&in[48], output);
+}
+
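+// 2-D 16x16 forward transform: column transform, rounding shift, transpose,
+// row transform, transpose back, then write out the coefficient block.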
+void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
+ int stride, int tx_type, int bd) {
+ __m128i in[64], out[64];
+ const TXFM_2D_CFG *cfg = NULL;
+
+ switch (tx_type) {
+ case DCT_DCT:
+ cfg = &fwd_txfm_2d_cfg_dct_dct_16;
+ load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]);
+ fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case ADST_DCT:
+ cfg = &fwd_txfm_2d_cfg_adst_dct_16;
+ load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case DCT_ADST:
+ cfg = &fwd_txfm_2d_cfg_dct_adst_16;
+ load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]);
+ fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case ADST_ADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ cfg = &fwd_txfm_2d_cfg_adst_dct_16;
+ load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case DCT_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_dct_adst_16;
+ load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]);
+ fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case FLIPADST_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(input, in, stride, 1, 1, cfg->shift[0]);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case ADST_FLIPADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+ case FLIPADST_ADST:
+ cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+ load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+ col_txfm_16x16_rounding(out, -cfg->shift[1]);
+ transpose_16x16(out, in);
+ fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+ transpose_16x16(out, in);
+ write_buffer_16x16(in, coeff);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0);
+ }
+ (void)bd;
+}
diff --git a/av1/encoder/x86/hybrid_fwd_txfm_avx2.c b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
new file mode 100644
index 0000000..77ae724
--- /dev/null
+++ b/av1/encoder/x86/hybrid_fwd_txfm_avx2.c
@@ -0,0 +1,1706 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // avx2
+
+#include "./av1_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/fwd_txfm_avx2.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
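+// Sum all 256 int16 samples of a 16x16 block; used by the DC-only transforms.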
+static int32_t get_16x16_sum(const int16_t *input, int stride) {
+ __m256i r0, r1, r2, r3, u0, u1;
+ __m256i zero = _mm256_setzero_si256();
+ __m256i sum = _mm256_setzero_si256();
+ const int16_t *blockBound = input + (stride << 4);
+ __m128i v0, v1;
+
+ while (input < blockBound) {
+ r0 = _mm256_loadu_si256((__m256i const *)input);
+ r1 = _mm256_loadu_si256((__m256i const *)(input + stride));
+ r2 = _mm256_loadu_si256((__m256i const *)(input + 2 * stride));
+ r3 = _mm256_loadu_si256((__m256i const *)(input + 3 * stride));
+
+ u0 = _mm256_add_epi16(r0, r1);
+ u1 = _mm256_add_epi16(r2, r3);
+ sum = _mm256_add_epi16(sum, u0);
+ sum = _mm256_add_epi16(sum, u1);
+
+ input += stride << 2;
+ }
+
+  // sign-extend the 16 int16 partial sums into two vectors of 8 int32
+ u0 = _mm256_unpacklo_epi16(zero, sum);
+ u1 = _mm256_unpackhi_epi16(zero, sum);
+ u0 = _mm256_srai_epi32(u0, 16);
+ u1 = _mm256_srai_epi32(u1, 16);
+ sum = _mm256_add_epi32(u0, u1);
+
+ u0 = _mm256_srli_si256(sum, 8);
+ u1 = _mm256_add_epi32(sum, u0);
+
+ v0 = _mm_add_epi32(_mm256_extracti128_si256(u1, 1),
+ _mm256_castsi256_si128(u1));
+ v1 = _mm_srli_si128(v0, 4);
+ v0 = _mm_add_epi32(v0, v1);
+ return (int32_t)_mm_extract_epi32(v0, 0);
+}
+
+void aom_fdct16x16_1_avx2(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int32_t dc = get_16x16_sum(input, stride);
+ output[0] = (tran_low_t)(dc >> 1);
+ _mm256_zeroupper();
+}
+
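+// Load a 16x16 block of int16 samples, optionally flipped vertically (flipud)
+// and/or horizontally (fliplr), then pre-scale each sample by 4 (<< 2).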
+static INLINE void load_buffer_16x16(const int16_t *input, int stride,
+ int flipud, int fliplr, __m256i *in) {
+ if (!flipud) {
+ in[0] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride));
+ in[1] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride));
+ in[2] = _mm256_loadu_si256((const __m256i *)(input + 2 * stride));
+ in[3] = _mm256_loadu_si256((const __m256i *)(input + 3 * stride));
+ in[4] = _mm256_loadu_si256((const __m256i *)(input + 4 * stride));
+ in[5] = _mm256_loadu_si256((const __m256i *)(input + 5 * stride));
+ in[6] = _mm256_loadu_si256((const __m256i *)(input + 6 * stride));
+ in[7] = _mm256_loadu_si256((const __m256i *)(input + 7 * stride));
+ in[8] = _mm256_loadu_si256((const __m256i *)(input + 8 * stride));
+ in[9] = _mm256_loadu_si256((const __m256i *)(input + 9 * stride));
+ in[10] = _mm256_loadu_si256((const __m256i *)(input + 10 * stride));
+ in[11] = _mm256_loadu_si256((const __m256i *)(input + 11 * stride));
+ in[12] = _mm256_loadu_si256((const __m256i *)(input + 12 * stride));
+ in[13] = _mm256_loadu_si256((const __m256i *)(input + 13 * stride));
+ in[14] = _mm256_loadu_si256((const __m256i *)(input + 14 * stride));
+ in[15] = _mm256_loadu_si256((const __m256i *)(input + 15 * stride));
+ } else {
+ in[0] = _mm256_loadu_si256((const __m256i *)(input + 15 * stride));
+ in[1] = _mm256_loadu_si256((const __m256i *)(input + 14 * stride));
+ in[2] = _mm256_loadu_si256((const __m256i *)(input + 13 * stride));
+ in[3] = _mm256_loadu_si256((const __m256i *)(input + 12 * stride));
+ in[4] = _mm256_loadu_si256((const __m256i *)(input + 11 * stride));
+ in[5] = _mm256_loadu_si256((const __m256i *)(input + 10 * stride));
+ in[6] = _mm256_loadu_si256((const __m256i *)(input + 9 * stride));
+ in[7] = _mm256_loadu_si256((const __m256i *)(input + 8 * stride));
+ in[8] = _mm256_loadu_si256((const __m256i *)(input + 7 * stride));
+ in[9] = _mm256_loadu_si256((const __m256i *)(input + 6 * stride));
+ in[10] = _mm256_loadu_si256((const __m256i *)(input + 5 * stride));
+ in[11] = _mm256_loadu_si256((const __m256i *)(input + 4 * stride));
+ in[12] = _mm256_loadu_si256((const __m256i *)(input + 3 * stride));
+ in[13] = _mm256_loadu_si256((const __m256i *)(input + 2 * stride));
+ in[14] = _mm256_loadu_si256((const __m256i *)(input + 1 * stride));
+ in[15] = _mm256_loadu_si256((const __m256i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ mm256_reverse_epi16(&in[0]);
+ mm256_reverse_epi16(&in[1]);
+ mm256_reverse_epi16(&in[2]);
+ mm256_reverse_epi16(&in[3]);
+ mm256_reverse_epi16(&in[4]);
+ mm256_reverse_epi16(&in[5]);
+ mm256_reverse_epi16(&in[6]);
+ mm256_reverse_epi16(&in[7]);
+ mm256_reverse_epi16(&in[8]);
+ mm256_reverse_epi16(&in[9]);
+ mm256_reverse_epi16(&in[10]);
+ mm256_reverse_epi16(&in[11]);
+ mm256_reverse_epi16(&in[12]);
+ mm256_reverse_epi16(&in[13]);
+ mm256_reverse_epi16(&in[14]);
+ mm256_reverse_epi16(&in[15]);
+ }
+
+ in[0] = _mm256_slli_epi16(in[0], 2);
+ in[1] = _mm256_slli_epi16(in[1], 2);
+ in[2] = _mm256_slli_epi16(in[2], 2);
+ in[3] = _mm256_slli_epi16(in[3], 2);
+ in[4] = _mm256_slli_epi16(in[4], 2);
+ in[5] = _mm256_slli_epi16(in[5], 2);
+ in[6] = _mm256_slli_epi16(in[6], 2);
+ in[7] = _mm256_slli_epi16(in[7], 2);
+ in[8] = _mm256_slli_epi16(in[8], 2);
+ in[9] = _mm256_slli_epi16(in[9], 2);
+ in[10] = _mm256_slli_epi16(in[10], 2);
+ in[11] = _mm256_slli_epi16(in[11], 2);
+ in[12] = _mm256_slli_epi16(in[12], 2);
+ in[13] = _mm256_slli_epi16(in[13], 2);
+ in[14] = _mm256_slli_epi16(in[14], 2);
+ in[15] = _mm256_slli_epi16(in[15], 2);
+}
+
+static INLINE void write_buffer_16x16(const __m256i *in, tran_low_t *output) {
+ int i;
+ for (i = 0; i < 16; ++i) {
+ storeu_output_avx2(&in[i], output + (i << 4));
+ }
+}
+
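+// Rounded arithmetic shift right by 2 on all 16 rows: add 1, plus one more
+// for negative values (derived from the sign bits), before the final >> 2.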
+static void right_shift_16x16(__m256i *in) {
+ const __m256i one = _mm256_set1_epi16(1);
+ __m256i s0 = _mm256_srai_epi16(in[0], 15);
+ __m256i s1 = _mm256_srai_epi16(in[1], 15);
+ __m256i s2 = _mm256_srai_epi16(in[2], 15);
+ __m256i s3 = _mm256_srai_epi16(in[3], 15);
+ __m256i s4 = _mm256_srai_epi16(in[4], 15);
+ __m256i s5 = _mm256_srai_epi16(in[5], 15);
+ __m256i s6 = _mm256_srai_epi16(in[6], 15);
+ __m256i s7 = _mm256_srai_epi16(in[7], 15);
+ __m256i s8 = _mm256_srai_epi16(in[8], 15);
+ __m256i s9 = _mm256_srai_epi16(in[9], 15);
+ __m256i s10 = _mm256_srai_epi16(in[10], 15);
+ __m256i s11 = _mm256_srai_epi16(in[11], 15);
+ __m256i s12 = _mm256_srai_epi16(in[12], 15);
+ __m256i s13 = _mm256_srai_epi16(in[13], 15);
+ __m256i s14 = _mm256_srai_epi16(in[14], 15);
+ __m256i s15 = _mm256_srai_epi16(in[15], 15);
+
+ in[0] = _mm256_add_epi16(in[0], one);
+ in[1] = _mm256_add_epi16(in[1], one);
+ in[2] = _mm256_add_epi16(in[2], one);
+ in[3] = _mm256_add_epi16(in[3], one);
+ in[4] = _mm256_add_epi16(in[4], one);
+ in[5] = _mm256_add_epi16(in[5], one);
+ in[6] = _mm256_add_epi16(in[6], one);
+ in[7] = _mm256_add_epi16(in[7], one);
+ in[8] = _mm256_add_epi16(in[8], one);
+ in[9] = _mm256_add_epi16(in[9], one);
+ in[10] = _mm256_add_epi16(in[10], one);
+ in[11] = _mm256_add_epi16(in[11], one);
+ in[12] = _mm256_add_epi16(in[12], one);
+ in[13] = _mm256_add_epi16(in[13], one);
+ in[14] = _mm256_add_epi16(in[14], one);
+ in[15] = _mm256_add_epi16(in[15], one);
+
+ in[0] = _mm256_sub_epi16(in[0], s0);
+ in[1] = _mm256_sub_epi16(in[1], s1);
+ in[2] = _mm256_sub_epi16(in[2], s2);
+ in[3] = _mm256_sub_epi16(in[3], s3);
+ in[4] = _mm256_sub_epi16(in[4], s4);
+ in[5] = _mm256_sub_epi16(in[5], s5);
+ in[6] = _mm256_sub_epi16(in[6], s6);
+ in[7] = _mm256_sub_epi16(in[7], s7);
+ in[8] = _mm256_sub_epi16(in[8], s8);
+ in[9] = _mm256_sub_epi16(in[9], s9);
+ in[10] = _mm256_sub_epi16(in[10], s10);
+ in[11] = _mm256_sub_epi16(in[11], s11);
+ in[12] = _mm256_sub_epi16(in[12], s12);
+ in[13] = _mm256_sub_epi16(in[13], s13);
+ in[14] = _mm256_sub_epi16(in[14], s14);
+ in[15] = _mm256_sub_epi16(in[15], s15);
+
+ in[0] = _mm256_srai_epi16(in[0], 2);
+ in[1] = _mm256_srai_epi16(in[1], 2);
+ in[2] = _mm256_srai_epi16(in[2], 2);
+ in[3] = _mm256_srai_epi16(in[3], 2);
+ in[4] = _mm256_srai_epi16(in[4], 2);
+ in[5] = _mm256_srai_epi16(in[5], 2);
+ in[6] = _mm256_srai_epi16(in[6], 2);
+ in[7] = _mm256_srai_epi16(in[7], 2);
+ in[8] = _mm256_srai_epi16(in[8], 2);
+ in[9] = _mm256_srai_epi16(in[9], 2);
+ in[10] = _mm256_srai_epi16(in[10], 2);
+ in[11] = _mm256_srai_epi16(in[11], 2);
+ in[12] = _mm256_srai_epi16(in[12], 2);
+ in[13] = _mm256_srai_epi16(in[13], 2);
+ in[14] = _mm256_srai_epi16(in[14], 2);
+ in[15] = _mm256_srai_epi16(in[15], 2);
+}
+
+static void fdct16_avx2(__m256i *in) {
+  // Naming convention: cospi_A_B holds the constant pair (A, B), with A first
+ const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64);
+ const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
+ const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64);
+ const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64);
+ const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
+
+ const __m256i cospi_p30_p02 = pair256_set_epi16(cospi_30_64, cospi_2_64);
+ const __m256i cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64);
+
+ const __m256i cospi_p14_p18 = pair256_set_epi16(cospi_14_64, cospi_18_64);
+ const __m256i cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64);
+
+ const __m256i cospi_p22_p10 = pair256_set_epi16(cospi_22_64, cospi_10_64);
+ const __m256i cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64);
+
+ const __m256i cospi_p06_p26 = pair256_set_epi16(cospi_6_64, cospi_26_64);
+ const __m256i cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64);
+
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m256i t0, t1, t2, t3, t4, t5, t6, t7;
+ __m256i v0, v1, v2, v3;
+ __m256i x0, x1;
+
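+  // butter_fly() (txfm_common_avx2.h) is assumed to do the paired 16-bit
+  // multiply-add with the cospi constants and round the result back to 16 bits.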
+ // 0, 4, 8, 12
+ u0 = _mm256_add_epi16(in[0], in[15]);
+ u1 = _mm256_add_epi16(in[1], in[14]);
+ u2 = _mm256_add_epi16(in[2], in[13]);
+ u3 = _mm256_add_epi16(in[3], in[12]);
+ u4 = _mm256_add_epi16(in[4], in[11]);
+ u5 = _mm256_add_epi16(in[5], in[10]);
+ u6 = _mm256_add_epi16(in[6], in[9]);
+ u7 = _mm256_add_epi16(in[7], in[8]);
+
+ s0 = _mm256_add_epi16(u0, u7);
+ s1 = _mm256_add_epi16(u1, u6);
+ s2 = _mm256_add_epi16(u2, u5);
+ s3 = _mm256_add_epi16(u3, u4);
+
+ // 0, 8
+ v0 = _mm256_add_epi16(s0, s3);
+ v1 = _mm256_add_epi16(s1, s2);
+
+ x0 = _mm256_unpacklo_epi16(v0, v1);
+ x1 = _mm256_unpackhi_epi16(v0, v1);
+
+ t0 = butter_fly(x0, x1, cospi_p16_p16);
+ t1 = butter_fly(x0, x1, cospi_p16_m16);
+
+ // 4, 12
+ v0 = _mm256_sub_epi16(s1, s2);
+ v1 = _mm256_sub_epi16(s0, s3);
+
+ x0 = _mm256_unpacklo_epi16(v0, v1);
+ x1 = _mm256_unpackhi_epi16(v0, v1);
+
+ t2 = butter_fly(x0, x1, cospi_p24_p08);
+ t3 = butter_fly(x0, x1, cospi_m08_p24);
+
+ // 2, 6, 10, 14
+ s0 = _mm256_sub_epi16(u3, u4);
+ s1 = _mm256_sub_epi16(u2, u5);
+ s2 = _mm256_sub_epi16(u1, u6);
+ s3 = _mm256_sub_epi16(u0, u7);
+
+ v0 = s0; // output[4]
+ v3 = s3; // output[7]
+
+ x0 = _mm256_unpacklo_epi16(s2, s1);
+ x1 = _mm256_unpackhi_epi16(s2, s1);
+
+ v2 = butter_fly(x0, x1, cospi_p16_p16); // output[5]
+ v1 = butter_fly(x0, x1, cospi_p16_m16); // output[6]
+
+ s0 = _mm256_add_epi16(v0, v1); // step[4]
+ s1 = _mm256_sub_epi16(v0, v1); // step[5]
+ s2 = _mm256_sub_epi16(v3, v2); // step[6]
+ s3 = _mm256_add_epi16(v3, v2); // step[7]
+
+ // 2, 14
+ x0 = _mm256_unpacklo_epi16(s0, s3);
+ x1 = _mm256_unpackhi_epi16(s0, s3);
+
+ t4 = butter_fly(x0, x1, cospi_p28_p04);
+ t5 = butter_fly(x0, x1, cospi_m04_p28);
+
+ // 10, 6
+ x0 = _mm256_unpacklo_epi16(s1, s2);
+ x1 = _mm256_unpackhi_epi16(s1, s2);
+ t6 = butter_fly(x0, x1, cospi_p12_p20);
+ t7 = butter_fly(x0, x1, cospi_m20_p12);
+
+ // 1, 3, 5, 7, 9, 11, 13, 15
+ s0 = _mm256_sub_epi16(in[7], in[8]); // step[8]
+ s1 = _mm256_sub_epi16(in[6], in[9]); // step[9]
+ u2 = _mm256_sub_epi16(in[5], in[10]);
+ u3 = _mm256_sub_epi16(in[4], in[11]);
+ u4 = _mm256_sub_epi16(in[3], in[12]);
+ u5 = _mm256_sub_epi16(in[2], in[13]);
+ s6 = _mm256_sub_epi16(in[1], in[14]); // step[14]
+ s7 = _mm256_sub_epi16(in[0], in[15]); // step[15]
+
+ in[0] = t0;
+ in[8] = t1;
+ in[4] = t2;
+ in[12] = t3;
+ in[2] = t4;
+ in[14] = t5;
+ in[10] = t6;
+ in[6] = t7;
+
+ x0 = _mm256_unpacklo_epi16(u5, u2);
+ x1 = _mm256_unpackhi_epi16(u5, u2);
+
+ s2 = butter_fly(x0, x1, cospi_p16_p16); // step[13]
+ s5 = butter_fly(x0, x1, cospi_p16_m16); // step[10]
+
+ x0 = _mm256_unpacklo_epi16(u4, u3);
+ x1 = _mm256_unpackhi_epi16(u4, u3);
+
+ s3 = butter_fly(x0, x1, cospi_p16_p16); // step[12]
+ s4 = butter_fly(x0, x1, cospi_p16_m16); // step[11]
+
+ u0 = _mm256_add_epi16(s0, s4); // output[8]
+ u1 = _mm256_add_epi16(s1, s5);
+ u2 = _mm256_sub_epi16(s1, s5);
+ u3 = _mm256_sub_epi16(s0, s4);
+ u4 = _mm256_sub_epi16(s7, s3);
+ u5 = _mm256_sub_epi16(s6, s2);
+ u6 = _mm256_add_epi16(s6, s2);
+ u7 = _mm256_add_epi16(s7, s3);
+
+ // stage 4
+ s0 = u0;
+ s3 = u3;
+ s4 = u4;
+ s7 = u7;
+
+ x0 = _mm256_unpacklo_epi16(u1, u6);
+ x1 = _mm256_unpackhi_epi16(u1, u6);
+
+ s1 = butter_fly(x0, x1, cospi_m08_p24);
+ s6 = butter_fly(x0, x1, cospi_p24_p08);
+
+ x0 = _mm256_unpacklo_epi16(u2, u5);
+ x1 = _mm256_unpackhi_epi16(u2, u5);
+
+ s2 = butter_fly(x0, x1, cospi_m24_m08);
+ s5 = butter_fly(x0, x1, cospi_m08_p24);
+
+ // stage 5
+ u0 = _mm256_add_epi16(s0, s1);
+ u1 = _mm256_sub_epi16(s0, s1);
+ u2 = _mm256_sub_epi16(s3, s2);
+ u3 = _mm256_add_epi16(s3, s2);
+ u4 = _mm256_add_epi16(s4, s5);
+ u5 = _mm256_sub_epi16(s4, s5);
+ u6 = _mm256_sub_epi16(s7, s6);
+ u7 = _mm256_add_epi16(s7, s6);
+
+ // stage 6
+ x0 = _mm256_unpacklo_epi16(u0, u7);
+ x1 = _mm256_unpackhi_epi16(u0, u7);
+ in[1] = butter_fly(x0, x1, cospi_p30_p02);
+ in[15] = butter_fly(x0, x1, cospi_m02_p30);
+
+ x0 = _mm256_unpacklo_epi16(u1, u6);
+ x1 = _mm256_unpackhi_epi16(u1, u6);
+ in[9] = butter_fly(x0, x1, cospi_p14_p18);
+ in[7] = butter_fly(x0, x1, cospi_m18_p14);
+
+ x0 = _mm256_unpacklo_epi16(u2, u5);
+ x1 = _mm256_unpackhi_epi16(u2, u5);
+ in[5] = butter_fly(x0, x1, cospi_p22_p10);
+ in[11] = butter_fly(x0, x1, cospi_m10_p22);
+
+ x0 = _mm256_unpacklo_epi16(u3, u4);
+ x1 = _mm256_unpackhi_epi16(u3, u4);
+ in[13] = butter_fly(x0, x1, cospi_p06_p26);
+ in[3] = butter_fly(x0, x1, cospi_m26_p06);
+}
+
+void fadst16_avx2(__m256i *in) {
+ const __m256i cospi_p01_p31 = pair256_set_epi16(cospi_1_64, cospi_31_64);
+ const __m256i cospi_p31_m01 = pair256_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m256i cospi_p05_p27 = pair256_set_epi16(cospi_5_64, cospi_27_64);
+ const __m256i cospi_p27_m05 = pair256_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m256i cospi_p09_p23 = pair256_set_epi16(cospi_9_64, cospi_23_64);
+ const __m256i cospi_p23_m09 = pair256_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m256i cospi_p13_p19 = pair256_set_epi16(cospi_13_64, cospi_19_64);
+ const __m256i cospi_p19_m13 = pair256_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m256i cospi_p17_p15 = pair256_set_epi16(cospi_17_64, cospi_15_64);
+ const __m256i cospi_p15_m17 = pair256_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m256i cospi_p21_p11 = pair256_set_epi16(cospi_21_64, cospi_11_64);
+ const __m256i cospi_p11_m21 = pair256_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m256i cospi_p25_p07 = pair256_set_epi16(cospi_25_64, cospi_7_64);
+ const __m256i cospi_p07_m25 = pair256_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m256i cospi_p29_p03 = pair256_set_epi16(cospi_29_64, cospi_3_64);
+ const __m256i cospi_p03_m29 = pair256_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m256i cospi_p04_p28 = pair256_set_epi16(cospi_4_64, cospi_28_64);
+ const __m256i cospi_p28_m04 = pair256_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m256i cospi_p20_p12 = pair256_set_epi16(cospi_20_64, cospi_12_64);
+ const __m256i cospi_p12_m20 = pair256_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m256i cospi_m28_p04 = pair256_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m256i cospi_m12_p20 = pair256_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m256i cospi_p08_p24 = pair256_set_epi16(cospi_8_64, cospi_24_64);
+ const __m256i cospi_p24_m08 = pair256_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m256i cospi_m24_p08 = pair256_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m256i cospi_m16_m16 = _mm256_set1_epi16((int16_t)-cospi_16_64);
+ const __m256i cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
+ const __m256i cospi_p16_m16 = pair256_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ __m256i s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+ __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+ __m256i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+ __m256i y0, y1;
+
+  // stage 1: s holds the unpacklo-half products; x holds the unpackhi-half products
+ y0 = _mm256_unpacklo_epi16(in[15], in[0]);
+ y1 = _mm256_unpackhi_epi16(in[15], in[0]);
+ s0 = _mm256_madd_epi16(y0, cospi_p01_p31);
+ x0 = _mm256_madd_epi16(y1, cospi_p01_p31);
+ s1 = _mm256_madd_epi16(y0, cospi_p31_m01);
+ x1 = _mm256_madd_epi16(y1, cospi_p31_m01);
+
+ y0 = _mm256_unpacklo_epi16(in[13], in[2]);
+ y1 = _mm256_unpackhi_epi16(in[13], in[2]);
+ s2 = _mm256_madd_epi16(y0, cospi_p05_p27);
+ x2 = _mm256_madd_epi16(y1, cospi_p05_p27);
+ s3 = _mm256_madd_epi16(y0, cospi_p27_m05);
+ x3 = _mm256_madd_epi16(y1, cospi_p27_m05);
+
+ y0 = _mm256_unpacklo_epi16(in[11], in[4]);
+ y1 = _mm256_unpackhi_epi16(in[11], in[4]);
+ s4 = _mm256_madd_epi16(y0, cospi_p09_p23);
+ x4 = _mm256_madd_epi16(y1, cospi_p09_p23);
+ s5 = _mm256_madd_epi16(y0, cospi_p23_m09);
+ x5 = _mm256_madd_epi16(y1, cospi_p23_m09);
+
+ y0 = _mm256_unpacklo_epi16(in[9], in[6]);
+ y1 = _mm256_unpackhi_epi16(in[9], in[6]);
+ s6 = _mm256_madd_epi16(y0, cospi_p13_p19);
+ x6 = _mm256_madd_epi16(y1, cospi_p13_p19);
+ s7 = _mm256_madd_epi16(y0, cospi_p19_m13);
+ x7 = _mm256_madd_epi16(y1, cospi_p19_m13);
+
+ y0 = _mm256_unpacklo_epi16(in[7], in[8]);
+ y1 = _mm256_unpackhi_epi16(in[7], in[8]);
+ s8 = _mm256_madd_epi16(y0, cospi_p17_p15);
+ x8 = _mm256_madd_epi16(y1, cospi_p17_p15);
+ s9 = _mm256_madd_epi16(y0, cospi_p15_m17);
+ x9 = _mm256_madd_epi16(y1, cospi_p15_m17);
+
+ y0 = _mm256_unpacklo_epi16(in[5], in[10]);
+ y1 = _mm256_unpackhi_epi16(in[5], in[10]);
+ s10 = _mm256_madd_epi16(y0, cospi_p21_p11);
+ x10 = _mm256_madd_epi16(y1, cospi_p21_p11);
+ s11 = _mm256_madd_epi16(y0, cospi_p11_m21);
+ x11 = _mm256_madd_epi16(y1, cospi_p11_m21);
+
+ y0 = _mm256_unpacklo_epi16(in[3], in[12]);
+ y1 = _mm256_unpackhi_epi16(in[3], in[12]);
+ s12 = _mm256_madd_epi16(y0, cospi_p25_p07);
+ x12 = _mm256_madd_epi16(y1, cospi_p25_p07);
+ s13 = _mm256_madd_epi16(y0, cospi_p07_m25);
+ x13 = _mm256_madd_epi16(y1, cospi_p07_m25);
+
+ y0 = _mm256_unpacklo_epi16(in[1], in[14]);
+ y1 = _mm256_unpackhi_epi16(in[1], in[14]);
+ s14 = _mm256_madd_epi16(y0, cospi_p29_p03);
+ x14 = _mm256_madd_epi16(y1, cospi_p29_p03);
+ s15 = _mm256_madd_epi16(y0, cospi_p03_m29);
+ x15 = _mm256_madd_epi16(y1, cospi_p03_m29);
+
+  // u combines the low-half (s) terms; v combines the high-half (x) terms
+ u0 = _mm256_add_epi32(s0, s8);
+ u1 = _mm256_add_epi32(s1, s9);
+ u2 = _mm256_add_epi32(s2, s10);
+ u3 = _mm256_add_epi32(s3, s11);
+ u4 = _mm256_add_epi32(s4, s12);
+ u5 = _mm256_add_epi32(s5, s13);
+ u6 = _mm256_add_epi32(s6, s14);
+ u7 = _mm256_add_epi32(s7, s15);
+
+ u8 = _mm256_sub_epi32(s0, s8);
+ u9 = _mm256_sub_epi32(s1, s9);
+ u10 = _mm256_sub_epi32(s2, s10);
+ u11 = _mm256_sub_epi32(s3, s11);
+ u12 = _mm256_sub_epi32(s4, s12);
+ u13 = _mm256_sub_epi32(s5, s13);
+ u14 = _mm256_sub_epi32(s6, s14);
+ u15 = _mm256_sub_epi32(s7, s15);
+
+ v0 = _mm256_add_epi32(x0, x8);
+ v1 = _mm256_add_epi32(x1, x9);
+ v2 = _mm256_add_epi32(x2, x10);
+ v3 = _mm256_add_epi32(x3, x11);
+ v4 = _mm256_add_epi32(x4, x12);
+ v5 = _mm256_add_epi32(x5, x13);
+ v6 = _mm256_add_epi32(x6, x14);
+ v7 = _mm256_add_epi32(x7, x15);
+
+ v8 = _mm256_sub_epi32(x0, x8);
+ v9 = _mm256_sub_epi32(x1, x9);
+ v10 = _mm256_sub_epi32(x2, x10);
+ v11 = _mm256_sub_epi32(x3, x11);
+ v12 = _mm256_sub_epi32(x4, x12);
+ v13 = _mm256_sub_epi32(x5, x13);
+ v14 = _mm256_sub_epi32(x6, x14);
+ v15 = _mm256_sub_epi32(x7, x15);
+
+  // rounding of the low-half (u) terms
+ u0 = _mm256_add_epi32(u0, dct_rounding);
+ u1 = _mm256_add_epi32(u1, dct_rounding);
+ u2 = _mm256_add_epi32(u2, dct_rounding);
+ u3 = _mm256_add_epi32(u3, dct_rounding);
+ u4 = _mm256_add_epi32(u4, dct_rounding);
+ u5 = _mm256_add_epi32(u5, dct_rounding);
+ u6 = _mm256_add_epi32(u6, dct_rounding);
+ u7 = _mm256_add_epi32(u7, dct_rounding);
+
+ u0 = _mm256_srai_epi32(u0, DCT_CONST_BITS);
+ u1 = _mm256_srai_epi32(u1, DCT_CONST_BITS);
+ u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS);
+ u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS);
+ u4 = _mm256_srai_epi32(u4, DCT_CONST_BITS);
+ u5 = _mm256_srai_epi32(u5, DCT_CONST_BITS);
+ u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS);
+ u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS);
+
+ u8 = _mm256_add_epi32(u8, dct_rounding);
+ u9 = _mm256_add_epi32(u9, dct_rounding);
+ u10 = _mm256_add_epi32(u10, dct_rounding);
+ u11 = _mm256_add_epi32(u11, dct_rounding);
+ u12 = _mm256_add_epi32(u12, dct_rounding);
+ u13 = _mm256_add_epi32(u13, dct_rounding);
+ u14 = _mm256_add_epi32(u14, dct_rounding);
+ u15 = _mm256_add_epi32(u15, dct_rounding);
+
+ u8 = _mm256_srai_epi32(u8, DCT_CONST_BITS);
+ u9 = _mm256_srai_epi32(u9, DCT_CONST_BITS);
+ u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS);
+ u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS);
+ u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS);
+ u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS);
+ u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
+ u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
+
+  // rounding of the high-half (v) terms
+ v0 = _mm256_add_epi32(v0, dct_rounding);
+ v1 = _mm256_add_epi32(v1, dct_rounding);
+ v2 = _mm256_add_epi32(v2, dct_rounding);
+ v3 = _mm256_add_epi32(v3, dct_rounding);
+ v4 = _mm256_add_epi32(v4, dct_rounding);
+ v5 = _mm256_add_epi32(v5, dct_rounding);
+ v6 = _mm256_add_epi32(v6, dct_rounding);
+ v7 = _mm256_add_epi32(v7, dct_rounding);
+
+ v0 = _mm256_srai_epi32(v0, DCT_CONST_BITS);
+ v1 = _mm256_srai_epi32(v1, DCT_CONST_BITS);
+ v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS);
+ v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS);
+ v4 = _mm256_srai_epi32(v4, DCT_CONST_BITS);
+ v5 = _mm256_srai_epi32(v5, DCT_CONST_BITS);
+ v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS);
+ v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS);
+
+ v8 = _mm256_add_epi32(v8, dct_rounding);
+ v9 = _mm256_add_epi32(v9, dct_rounding);
+ v10 = _mm256_add_epi32(v10, dct_rounding);
+ v11 = _mm256_add_epi32(v11, dct_rounding);
+ v12 = _mm256_add_epi32(v12, dct_rounding);
+ v13 = _mm256_add_epi32(v13, dct_rounding);
+ v14 = _mm256_add_epi32(v14, dct_rounding);
+ v15 = _mm256_add_epi32(v15, dct_rounding);
+
+ v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS);
+ v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS);
+ v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS);
+ v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS);
+ v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
+ v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
+ v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
+ v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
+
+  // saturating pack from 32-bit back to 16-bit
+ x0 = _mm256_packs_epi32(u0, v0);
+ x1 = _mm256_packs_epi32(u1, v1);
+ x2 = _mm256_packs_epi32(u2, v2);
+ x3 = _mm256_packs_epi32(u3, v3);
+ x4 = _mm256_packs_epi32(u4, v4);
+ x5 = _mm256_packs_epi32(u5, v5);
+ x6 = _mm256_packs_epi32(u6, v6);
+ x7 = _mm256_packs_epi32(u7, v7);
+ x8 = _mm256_packs_epi32(u8, v8);
+ x9 = _mm256_packs_epi32(u9, v9);
+ x10 = _mm256_packs_epi32(u10, v10);
+ x11 = _mm256_packs_epi32(u11, v11);
+ x12 = _mm256_packs_epi32(u12, v12);
+ x13 = _mm256_packs_epi32(u13, v13);
+ x14 = _mm256_packs_epi32(u14, v14);
+ x15 = _mm256_packs_epi32(u15, v15);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+
+ y0 = _mm256_unpacklo_epi16(x8, x9);
+ y1 = _mm256_unpackhi_epi16(x8, x9);
+ s8 = _mm256_madd_epi16(y0, cospi_p04_p28);
+ x8 = _mm256_madd_epi16(y1, cospi_p04_p28);
+ s9 = _mm256_madd_epi16(y0, cospi_p28_m04);
+ x9 = _mm256_madd_epi16(y1, cospi_p28_m04);
+
+ y0 = _mm256_unpacklo_epi16(x10, x11);
+ y1 = _mm256_unpackhi_epi16(x10, x11);
+ s10 = _mm256_madd_epi16(y0, cospi_p20_p12);
+ x10 = _mm256_madd_epi16(y1, cospi_p20_p12);
+ s11 = _mm256_madd_epi16(y0, cospi_p12_m20);
+ x11 = _mm256_madd_epi16(y1, cospi_p12_m20);
+
+ y0 = _mm256_unpacklo_epi16(x12, x13);
+ y1 = _mm256_unpackhi_epi16(x12, x13);
+ s12 = _mm256_madd_epi16(y0, cospi_m28_p04);
+ x12 = _mm256_madd_epi16(y1, cospi_m28_p04);
+ s13 = _mm256_madd_epi16(y0, cospi_p04_p28);
+ x13 = _mm256_madd_epi16(y1, cospi_p04_p28);
+
+ y0 = _mm256_unpacklo_epi16(x14, x15);
+ y1 = _mm256_unpackhi_epi16(x14, x15);
+ s14 = _mm256_madd_epi16(y0, cospi_m12_p20);
+ x14 = _mm256_madd_epi16(y1, cospi_m12_p20);
+ s15 = _mm256_madd_epi16(y0, cospi_p20_p12);
+ x15 = _mm256_madd_epi16(y1, cospi_p20_p12);
+
+ x0 = _mm256_add_epi16(s0, s4);
+ x1 = _mm256_add_epi16(s1, s5);
+ x2 = _mm256_add_epi16(s2, s6);
+ x3 = _mm256_add_epi16(s3, s7);
+ x4 = _mm256_sub_epi16(s0, s4);
+ x5 = _mm256_sub_epi16(s1, s5);
+ x6 = _mm256_sub_epi16(s2, s6);
+ x7 = _mm256_sub_epi16(s3, s7);
+
+ u8 = _mm256_add_epi32(s8, s12);
+ u9 = _mm256_add_epi32(s9, s13);
+ u10 = _mm256_add_epi32(s10, s14);
+ u11 = _mm256_add_epi32(s11, s15);
+ u12 = _mm256_sub_epi32(s8, s12);
+ u13 = _mm256_sub_epi32(s9, s13);
+ u14 = _mm256_sub_epi32(s10, s14);
+ u15 = _mm256_sub_epi32(s11, s15);
+
+ v8 = _mm256_add_epi32(x8, x12);
+ v9 = _mm256_add_epi32(x9, x13);
+ v10 = _mm256_add_epi32(x10, x14);
+ v11 = _mm256_add_epi32(x11, x15);
+ v12 = _mm256_sub_epi32(x8, x12);
+ v13 = _mm256_sub_epi32(x9, x13);
+ v14 = _mm256_sub_epi32(x10, x14);
+ v15 = _mm256_sub_epi32(x11, x15);
+
+ u8 = _mm256_add_epi32(u8, dct_rounding);
+ u9 = _mm256_add_epi32(u9, dct_rounding);
+ u10 = _mm256_add_epi32(u10, dct_rounding);
+ u11 = _mm256_add_epi32(u11, dct_rounding);
+ u12 = _mm256_add_epi32(u12, dct_rounding);
+ u13 = _mm256_add_epi32(u13, dct_rounding);
+ u14 = _mm256_add_epi32(u14, dct_rounding);
+ u15 = _mm256_add_epi32(u15, dct_rounding);
+
+ u8 = _mm256_srai_epi32(u8, DCT_CONST_BITS);
+ u9 = _mm256_srai_epi32(u9, DCT_CONST_BITS);
+ u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS);
+ u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS);
+ u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS);
+ u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS);
+ u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
+ u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
+
+ v8 = _mm256_add_epi32(v8, dct_rounding);
+ v9 = _mm256_add_epi32(v9, dct_rounding);
+ v10 = _mm256_add_epi32(v10, dct_rounding);
+ v11 = _mm256_add_epi32(v11, dct_rounding);
+ v12 = _mm256_add_epi32(v12, dct_rounding);
+ v13 = _mm256_add_epi32(v13, dct_rounding);
+ v14 = _mm256_add_epi32(v14, dct_rounding);
+ v15 = _mm256_add_epi32(v15, dct_rounding);
+
+ v8 = _mm256_srai_epi32(v8, DCT_CONST_BITS);
+ v9 = _mm256_srai_epi32(v9, DCT_CONST_BITS);
+ v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS);
+ v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS);
+ v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
+ v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
+ v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
+ v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
+
+ x8 = _mm256_packs_epi32(u8, v8);
+ x9 = _mm256_packs_epi32(u9, v9);
+ x10 = _mm256_packs_epi32(u10, v10);
+ x11 = _mm256_packs_epi32(u11, v11);
+ x12 = _mm256_packs_epi32(u12, v12);
+ x13 = _mm256_packs_epi32(u13, v13);
+ x14 = _mm256_packs_epi32(u14, v14);
+ x15 = _mm256_packs_epi32(u15, v15);
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+
+ y0 = _mm256_unpacklo_epi16(x4, x5);
+ y1 = _mm256_unpackhi_epi16(x4, x5);
+ s4 = _mm256_madd_epi16(y0, cospi_p08_p24);
+ x4 = _mm256_madd_epi16(y1, cospi_p08_p24);
+ s5 = _mm256_madd_epi16(y0, cospi_p24_m08);
+ x5 = _mm256_madd_epi16(y1, cospi_p24_m08);
+
+ y0 = _mm256_unpacklo_epi16(x6, x7);
+ y1 = _mm256_unpackhi_epi16(x6, x7);
+ s6 = _mm256_madd_epi16(y0, cospi_m24_p08);
+ x6 = _mm256_madd_epi16(y1, cospi_m24_p08);
+ s7 = _mm256_madd_epi16(y0, cospi_p08_p24);
+ x7 = _mm256_madd_epi16(y1, cospi_p08_p24);
+
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+
+ y0 = _mm256_unpacklo_epi16(x12, x13);
+ y1 = _mm256_unpackhi_epi16(x12, x13);
+ s12 = _mm256_madd_epi16(y0, cospi_p08_p24);
+ x12 = _mm256_madd_epi16(y1, cospi_p08_p24);
+ s13 = _mm256_madd_epi16(y0, cospi_p24_m08);
+ x13 = _mm256_madd_epi16(y1, cospi_p24_m08);
+
+ y0 = _mm256_unpacklo_epi16(x14, x15);
+ y1 = _mm256_unpackhi_epi16(x14, x15);
+ s14 = _mm256_madd_epi16(y0, cospi_m24_p08);
+ x14 = _mm256_madd_epi16(y1, cospi_m24_p08);
+ s15 = _mm256_madd_epi16(y0, cospi_p08_p24);
+ x15 = _mm256_madd_epi16(y1, cospi_p08_p24);
+
+ in[0] = _mm256_add_epi16(s0, s2);
+ x1 = _mm256_add_epi16(s1, s3);
+ x2 = _mm256_sub_epi16(s0, s2);
+ x3 = _mm256_sub_epi16(s1, s3);
+
+ // Rounding on s4 + s6, s5 + s7, s4 - s6, s5 - s7
+ u4 = _mm256_add_epi32(s4, s6);
+ u5 = _mm256_add_epi32(s5, s7);
+ u6 = _mm256_sub_epi32(s4, s6);
+ u7 = _mm256_sub_epi32(s5, s7);
+
+ v4 = _mm256_add_epi32(x4, x6);
+ v5 = _mm256_add_epi32(x5, x7);
+ v6 = _mm256_sub_epi32(x4, x6);
+ v7 = _mm256_sub_epi32(x5, x7);
+
+ u4 = _mm256_add_epi32(u4, dct_rounding);
+ u5 = _mm256_add_epi32(u5, dct_rounding);
+ u6 = _mm256_add_epi32(u6, dct_rounding);
+ u7 = _mm256_add_epi32(u7, dct_rounding);
+
+ u4 = _mm256_srai_epi32(u4, DCT_CONST_BITS);
+ u5 = _mm256_srai_epi32(u5, DCT_CONST_BITS);
+ u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS);
+ u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS);
+
+ v4 = _mm256_add_epi32(v4, dct_rounding);
+ v5 = _mm256_add_epi32(v5, dct_rounding);
+ v6 = _mm256_add_epi32(v6, dct_rounding);
+ v7 = _mm256_add_epi32(v7, dct_rounding);
+
+ v4 = _mm256_srai_epi32(v4, DCT_CONST_BITS);
+ v5 = _mm256_srai_epi32(v5, DCT_CONST_BITS);
+ v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS);
+ v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS);
+
+ x4 = _mm256_packs_epi32(u4, v4);
+ in[12] = _mm256_packs_epi32(u5, v5);
+ x6 = _mm256_packs_epi32(u6, v6);
+ x7 = _mm256_packs_epi32(u7, v7);
+
+ x8 = _mm256_add_epi16(s8, s10);
+ in[14] = _mm256_add_epi16(s9, s11);
+ x10 = _mm256_sub_epi16(s8, s10);
+ x11 = _mm256_sub_epi16(s9, s11);
+
+ // Rounding on s12 + s14, s13 + s15, s12 - s14, s13 - s15
+ u12 = _mm256_add_epi32(s12, s14);
+ u13 = _mm256_add_epi32(s13, s15);
+ u14 = _mm256_sub_epi32(s12, s14);
+ u15 = _mm256_sub_epi32(s13, s15);
+
+ v12 = _mm256_add_epi32(x12, x14);
+ v13 = _mm256_add_epi32(x13, x15);
+ v14 = _mm256_sub_epi32(x12, x14);
+ v15 = _mm256_sub_epi32(x13, x15);
+
+ u12 = _mm256_add_epi32(u12, dct_rounding);
+ u13 = _mm256_add_epi32(u13, dct_rounding);
+ u14 = _mm256_add_epi32(u14, dct_rounding);
+ u15 = _mm256_add_epi32(u15, dct_rounding);
+
+ u12 = _mm256_srai_epi32(u12, DCT_CONST_BITS);
+ u13 = _mm256_srai_epi32(u13, DCT_CONST_BITS);
+ u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
+ u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
+
+ v12 = _mm256_add_epi32(v12, dct_rounding);
+ v13 = _mm256_add_epi32(v13, dct_rounding);
+ v14 = _mm256_add_epi32(v14, dct_rounding);
+ v15 = _mm256_add_epi32(v15, dct_rounding);
+
+ v12 = _mm256_srai_epi32(v12, DCT_CONST_BITS);
+ v13 = _mm256_srai_epi32(v13, DCT_CONST_BITS);
+ v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
+ v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
+
+ x12 = _mm256_packs_epi32(u12, v12);
+ x13 = _mm256_packs_epi32(u13, v13);
+ x14 = _mm256_packs_epi32(u14, v14);
+ x15 = _mm256_packs_epi32(u15, v15);
+ in[2] = x12;
+
+ // stage 4
+ y0 = _mm256_unpacklo_epi16(x2, x3);
+ y1 = _mm256_unpackhi_epi16(x2, x3);
+ s2 = _mm256_madd_epi16(y0, cospi_m16_m16);
+ x2 = _mm256_madd_epi16(y1, cospi_m16_m16);
+ s3 = _mm256_madd_epi16(y0, cospi_p16_m16);
+ x3 = _mm256_madd_epi16(y1, cospi_p16_m16);
+
+ y0 = _mm256_unpacklo_epi16(x6, x7);
+ y1 = _mm256_unpackhi_epi16(x6, x7);
+ s6 = _mm256_madd_epi16(y0, cospi_p16_p16);
+ x6 = _mm256_madd_epi16(y1, cospi_p16_p16);
+ s7 = _mm256_madd_epi16(y0, cospi_m16_p16);
+ x7 = _mm256_madd_epi16(y1, cospi_m16_p16);
+
+ y0 = _mm256_unpacklo_epi16(x10, x11);
+ y1 = _mm256_unpackhi_epi16(x10, x11);
+ s10 = _mm256_madd_epi16(y0, cospi_p16_p16);
+ x10 = _mm256_madd_epi16(y1, cospi_p16_p16);
+ s11 = _mm256_madd_epi16(y0, cospi_m16_p16);
+ x11 = _mm256_madd_epi16(y1, cospi_m16_p16);
+
+ y0 = _mm256_unpacklo_epi16(x14, x15);
+ y1 = _mm256_unpackhi_epi16(x14, x15);
+ s14 = _mm256_madd_epi16(y0, cospi_m16_m16);
+ x14 = _mm256_madd_epi16(y1, cospi_m16_m16);
+ s15 = _mm256_madd_epi16(y0, cospi_p16_m16);
+ x15 = _mm256_madd_epi16(y1, cospi_p16_m16);
+
+ // Rounding
+ u2 = _mm256_add_epi32(s2, dct_rounding);
+ u3 = _mm256_add_epi32(s3, dct_rounding);
+ u6 = _mm256_add_epi32(s6, dct_rounding);
+ u7 = _mm256_add_epi32(s7, dct_rounding);
+
+ u10 = _mm256_add_epi32(s10, dct_rounding);
+ u11 = _mm256_add_epi32(s11, dct_rounding);
+ u14 = _mm256_add_epi32(s14, dct_rounding);
+ u15 = _mm256_add_epi32(s15, dct_rounding);
+
+ u2 = _mm256_srai_epi32(u2, DCT_CONST_BITS);
+ u3 = _mm256_srai_epi32(u3, DCT_CONST_BITS);
+ u6 = _mm256_srai_epi32(u6, DCT_CONST_BITS);
+ u7 = _mm256_srai_epi32(u7, DCT_CONST_BITS);
+
+ u10 = _mm256_srai_epi32(u10, DCT_CONST_BITS);
+ u11 = _mm256_srai_epi32(u11, DCT_CONST_BITS);
+ u14 = _mm256_srai_epi32(u14, DCT_CONST_BITS);
+ u15 = _mm256_srai_epi32(u15, DCT_CONST_BITS);
+
+ v2 = _mm256_add_epi32(x2, dct_rounding);
+ v3 = _mm256_add_epi32(x3, dct_rounding);
+ v6 = _mm256_add_epi32(x6, dct_rounding);
+ v7 = _mm256_add_epi32(x7, dct_rounding);
+
+ v10 = _mm256_add_epi32(x10, dct_rounding);
+ v11 = _mm256_add_epi32(x11, dct_rounding);
+ v14 = _mm256_add_epi32(x14, dct_rounding);
+ v15 = _mm256_add_epi32(x15, dct_rounding);
+
+ v2 = _mm256_srai_epi32(v2, DCT_CONST_BITS);
+ v3 = _mm256_srai_epi32(v3, DCT_CONST_BITS);
+ v6 = _mm256_srai_epi32(v6, DCT_CONST_BITS);
+ v7 = _mm256_srai_epi32(v7, DCT_CONST_BITS);
+
+ v10 = _mm256_srai_epi32(v10, DCT_CONST_BITS);
+ v11 = _mm256_srai_epi32(v11, DCT_CONST_BITS);
+ v14 = _mm256_srai_epi32(v14, DCT_CONST_BITS);
+ v15 = _mm256_srai_epi32(v15, DCT_CONST_BITS);
+
+ in[7] = _mm256_packs_epi32(u2, v2);
+ in[8] = _mm256_packs_epi32(u3, v3);
+
+ in[4] = _mm256_packs_epi32(u6, v6);
+ in[11] = _mm256_packs_epi32(u7, v7);
+
+ in[6] = _mm256_packs_epi32(u10, v10);
+ in[9] = _mm256_packs_epi32(u11, v11);
+
+ in[5] = _mm256_packs_epi32(u14, v14);
+ in[10] = _mm256_packs_epi32(u15, v15);
+
+ in[1] = _mm256_sub_epi16(zero, x8);
+ in[3] = _mm256_sub_epi16(zero, x4);
+ in[13] = _mm256_sub_epi16(zero, x13);
+ in[15] = _mm256_sub_epi16(zero, x1);
+}
+
+#if CONFIG_EXT_TX
+static void fidtx16_avx2(__m256i *in) { txfm_scaling16_avx2(Sqrt2, in); }
+#endif
+
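+// 16x16 forward hybrid transform. Every case follows the same pattern: load
+// (with optional up/down and left/right flips), apply the first 1-D
+// transform, transpose, right-shift the intermediates, and apply the second
+// 1-D transform; the common tail transposes back and stores the coefficients.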
+void av1_fht16x16_avx2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+ __m256i in[16];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fdct16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fdct16_avx2(in);
+ break;
+ case ADST_DCT:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fdct16_avx2(in);
+ break;
+ case DCT_ADST:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fdct16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ case ADST_ADST:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ load_buffer_16x16(input, stride, 1, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fdct16_avx2(in);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_16x16(input, stride, 0, 1, in);
+ fdct16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_16x16(input, stride, 1, 1, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_16x16(input, stride, 0, 1, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_16x16(input, stride, 1, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ case V_DCT:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fdct16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fidtx16_avx2(in);
+ break;
+ case H_DCT:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fidtx16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fdct16_avx2(in);
+ break;
+ case V_ADST:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fidtx16_avx2(in);
+ break;
+ case H_ADST:
+ load_buffer_16x16(input, stride, 0, 0, in);
+ fidtx16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+ case V_FLIPADST:
+ load_buffer_16x16(input, stride, 1, 0, in);
+ fadst16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fidtx16_avx2(in);
+ break;
+ case H_FLIPADST:
+ load_buffer_16x16(input, stride, 0, 1, in);
+ fidtx16_avx2(in);
+ mm256_transpose_16x16(in);
+ right_shift_16x16(in);
+ fadst16_avx2(in);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ mm256_transpose_16x16(in);
+ write_buffer_16x16(in, output);
+ _mm256_zeroupper();
+}
+
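+// DC-only 32x32 forward transform: sums the four 16x16 quadrants of the
+// input and right-shifts the total by 3 to produce output[0].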
+void aom_fdct32x32_1_avx2(const int16_t *input, tran_low_t *output,
+ int stride) {
+  // top-left 16x16 block
+ int32_t sum = get_16x16_sum(input, stride);
+  // top-right 16x16 block
+ sum += get_16x16_sum(input + 16, stride);
+  // bottom-left 16x16 block
+ sum += get_16x16_sum(input + (stride << 4), stride);
+  // bottom-right 16x16 block
+ sum += get_16x16_sum(input + (stride << 4) + 16, stride);
+
+ sum >>= 3;
+ output[0] = (tran_low_t)sum;
+ _mm256_zeroupper();
+}
+
+static void mm256_vectors_swap(__m256i *a0, __m256i *a1, const int size) {
+ int i = 0;
+ __m256i temp;
+ while (i < size) {
+ temp = a0[i];
+ a0[i] = a1[i];
+ a1[i] = temp;
+ i++;
+ }
+}
+
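+// Transposes a 32x32 block stored as four 16x16 quadrants (in0 and in1 each
+// hold a 32x16 half): each quadrant is transposed in place and the two
+// off-diagonal quadrants are swapped.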
+static void mm256_transpose_32x32(__m256i *in0, __m256i *in1) {
+ mm256_transpose_16x16(in0);
+ mm256_transpose_16x16(&in0[16]);
+ mm256_transpose_16x16(in1);
+ mm256_transpose_16x16(&in1[16]);
+ mm256_vectors_swap(&in0[16], in1, 16);
+}
+
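+// Stage 1 butterflies of the 32-point DCT:
+//   even[i] = in[i] + in[31 - i],  odd[i] = in[15 - i] - in[16 + i]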
+static void prepare_16x16_even(const __m256i *in, __m256i *even) {
+ even[0] = _mm256_add_epi16(in[0], in[31]);
+ even[1] = _mm256_add_epi16(in[1], in[30]);
+ even[2] = _mm256_add_epi16(in[2], in[29]);
+ even[3] = _mm256_add_epi16(in[3], in[28]);
+ even[4] = _mm256_add_epi16(in[4], in[27]);
+ even[5] = _mm256_add_epi16(in[5], in[26]);
+ even[6] = _mm256_add_epi16(in[6], in[25]);
+ even[7] = _mm256_add_epi16(in[7], in[24]);
+ even[8] = _mm256_add_epi16(in[8], in[23]);
+ even[9] = _mm256_add_epi16(in[9], in[22]);
+ even[10] = _mm256_add_epi16(in[10], in[21]);
+ even[11] = _mm256_add_epi16(in[11], in[20]);
+ even[12] = _mm256_add_epi16(in[12], in[19]);
+ even[13] = _mm256_add_epi16(in[13], in[18]);
+ even[14] = _mm256_add_epi16(in[14], in[17]);
+ even[15] = _mm256_add_epi16(in[15], in[16]);
+}
+
+static void prepare_16x16_odd(const __m256i *in, __m256i *odd) {
+ odd[0] = _mm256_sub_epi16(in[15], in[16]);
+ odd[1] = _mm256_sub_epi16(in[14], in[17]);
+ odd[2] = _mm256_sub_epi16(in[13], in[18]);
+ odd[3] = _mm256_sub_epi16(in[12], in[19]);
+ odd[4] = _mm256_sub_epi16(in[11], in[20]);
+ odd[5] = _mm256_sub_epi16(in[10], in[21]);
+ odd[6] = _mm256_sub_epi16(in[9], in[22]);
+ odd[7] = _mm256_sub_epi16(in[8], in[23]);
+ odd[8] = _mm256_sub_epi16(in[7], in[24]);
+ odd[9] = _mm256_sub_epi16(in[6], in[25]);
+ odd[10] = _mm256_sub_epi16(in[5], in[26]);
+ odd[11] = _mm256_sub_epi16(in[4], in[27]);
+ odd[12] = _mm256_sub_epi16(in[3], in[28]);
+ odd[13] = _mm256_sub_epi16(in[2], in[29]);
+ odd[14] = _mm256_sub_epi16(in[1], in[30]);
+ odd[15] = _mm256_sub_epi16(in[0], in[31]);
+}
+
+static void collect_16col(const __m256i *even, const __m256i *odd,
+ __m256i *out) {
+ // fdct16_avx2() already maps the output
+ out[0] = even[0];
+ out[2] = even[1];
+ out[4] = even[2];
+ out[6] = even[3];
+ out[8] = even[4];
+ out[10] = even[5];
+ out[12] = even[6];
+ out[14] = even[7];
+ out[16] = even[8];
+ out[18] = even[9];
+ out[20] = even[10];
+ out[22] = even[11];
+ out[24] = even[12];
+ out[26] = even[13];
+ out[28] = even[14];
+ out[30] = even[15];
+
+ out[1] = odd[0];
+ out[17] = odd[1];
+ out[9] = odd[2];
+ out[25] = odd[3];
+ out[5] = odd[4];
+ out[21] = odd[5];
+ out[13] = odd[6];
+ out[29] = odd[7];
+ out[3] = odd[8];
+ out[19] = odd[9];
+ out[11] = odd[10];
+ out[27] = odd[11];
+ out[7] = odd[12];
+ out[23] = odd[13];
+ out[15] = odd[14];
+ out[31] = odd[15];
+}
+
+static void collect_coeffs(const __m256i *first_16col_even,
+ const __m256i *first_16col_odd,
+ const __m256i *second_16col_even,
+ const __m256i *second_16col_odd, __m256i *in0,
+ __m256i *in1) {
+ collect_16col(first_16col_even, first_16col_odd, in0);
+ collect_16col(second_16col_even, second_16col_odd, in1);
+}
+
+static void fdct16_odd_avx2(__m256i *in) {
+  // constant naming: cospi_L_H packs the pair (L, H), with L first
+ const __m256i cospi_p16_p16 = pair256_set_epi16(cospi_16_64, cospi_16_64);
+ const __m256i cospi_m16_p16 = pair256_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m256i cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i cospi_p24_p08 = pair256_set_epi16(cospi_24_64, cospi_8_64);
+ const __m256i cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m256i cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m256i cospi_p28_p04 = pair256_set_epi16(cospi_28_64, cospi_4_64);
+ const __m256i cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m256i cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m256i cospi_p12_p20 = pair256_set_epi16(cospi_12_64, cospi_20_64);
+ const __m256i cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m256i cospi_p31_p01 = pair256_set_epi16(cospi_31_64, cospi_1_64);
+ const __m256i cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m256i cospi_p15_p17 = pair256_set_epi16(cospi_15_64, cospi_17_64);
+ const __m256i cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m256i cospi_p23_p09 = pair256_set_epi16(cospi_23_64, cospi_9_64);
+ const __m256i cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m256i cospi_p07_p25 = pair256_set_epi16(cospi_7_64, cospi_25_64);
+ const __m256i cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m256i cospi_p27_p05 = pair256_set_epi16(cospi_27_64, cospi_5_64);
+ const __m256i cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m256i cospi_p11_p21 = pair256_set_epi16(cospi_11_64, cospi_21_64);
+ const __m256i cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m256i cospi_p19_p13 = pair256_set_epi16(cospi_19_64, cospi_13_64);
+ const __m256i cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m256i cospi_p03_p29 = pair256_set_epi16(cospi_3_64, cospi_29_64);
+ const __m256i cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
+
+ __m256i x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ __m256i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15;
+ __m256i u0, u1;
+
+ // stage 1 is in prepare_16x16_odd()
+
+ // stage 2
+ y0 = in[0];
+ y1 = in[1];
+ y2 = in[2];
+ y3 = in[3];
+
+ u0 = _mm256_unpacklo_epi16(in[4], in[11]);
+ u1 = _mm256_unpackhi_epi16(in[4], in[11]);
+ y4 = butter_fly(u0, u1, cospi_m16_p16);
+ y11 = butter_fly(u0, u1, cospi_p16_p16);
+
+ u0 = _mm256_unpacklo_epi16(in[5], in[10]);
+ u1 = _mm256_unpackhi_epi16(in[5], in[10]);
+ y5 = butter_fly(u0, u1, cospi_m16_p16);
+ y10 = butter_fly(u0, u1, cospi_p16_p16);
+
+ u0 = _mm256_unpacklo_epi16(in[6], in[9]);
+ u1 = _mm256_unpackhi_epi16(in[6], in[9]);
+ y6 = butter_fly(u0, u1, cospi_m16_p16);
+ y9 = butter_fly(u0, u1, cospi_p16_p16);
+
+ u0 = _mm256_unpacklo_epi16(in[7], in[8]);
+ u1 = _mm256_unpackhi_epi16(in[7], in[8]);
+ y7 = butter_fly(u0, u1, cospi_m16_p16);
+ y8 = butter_fly(u0, u1, cospi_p16_p16);
+
+ y12 = in[12];
+ y13 = in[13];
+ y14 = in[14];
+ y15 = in[15];
+
+ // stage 3
+ x0 = _mm256_add_epi16(y0, y7);
+ x1 = _mm256_add_epi16(y1, y6);
+ x2 = _mm256_add_epi16(y2, y5);
+ x3 = _mm256_add_epi16(y3, y4);
+ x4 = _mm256_sub_epi16(y3, y4);
+ x5 = _mm256_sub_epi16(y2, y5);
+ x6 = _mm256_sub_epi16(y1, y6);
+ x7 = _mm256_sub_epi16(y0, y7);
+ x8 = _mm256_sub_epi16(y15, y8);
+ x9 = _mm256_sub_epi16(y14, y9);
+ x10 = _mm256_sub_epi16(y13, y10);
+ x11 = _mm256_sub_epi16(y12, y11);
+ x12 = _mm256_add_epi16(y12, y11);
+ x13 = _mm256_add_epi16(y13, y10);
+ x14 = _mm256_add_epi16(y14, y9);
+ x15 = _mm256_add_epi16(y15, y8);
+
+ // stage 4
+ y0 = x0;
+ y1 = x1;
+ y6 = x6;
+ y7 = x7;
+ y8 = x8;
+ y9 = x9;
+ y14 = x14;
+ y15 = x15;
+
+ u0 = _mm256_unpacklo_epi16(x2, x13);
+ u1 = _mm256_unpackhi_epi16(x2, x13);
+ y2 = butter_fly(u0, u1, cospi_m08_p24);
+ y13 = butter_fly(u0, u1, cospi_p24_p08);
+
+ u0 = _mm256_unpacklo_epi16(x3, x12);
+ u1 = _mm256_unpackhi_epi16(x3, x12);
+ y3 = butter_fly(u0, u1, cospi_m08_p24);
+ y12 = butter_fly(u0, u1, cospi_p24_p08);
+
+ u0 = _mm256_unpacklo_epi16(x4, x11);
+ u1 = _mm256_unpackhi_epi16(x4, x11);
+ y4 = butter_fly(u0, u1, cospi_m24_m08);
+ y11 = butter_fly(u0, u1, cospi_m08_p24);
+
+ u0 = _mm256_unpacklo_epi16(x5, x10);
+ u1 = _mm256_unpackhi_epi16(x5, x10);
+ y5 = butter_fly(u0, u1, cospi_m24_m08);
+ y10 = butter_fly(u0, u1, cospi_m08_p24);
+
+ // stage 5
+ x0 = _mm256_add_epi16(y0, y3);
+ x1 = _mm256_add_epi16(y1, y2);
+ x2 = _mm256_sub_epi16(y1, y2);
+ x3 = _mm256_sub_epi16(y0, y3);
+ x4 = _mm256_sub_epi16(y7, y4);
+ x5 = _mm256_sub_epi16(y6, y5);
+ x6 = _mm256_add_epi16(y6, y5);
+ x7 = _mm256_add_epi16(y7, y4);
+
+ x8 = _mm256_add_epi16(y8, y11);
+ x9 = _mm256_add_epi16(y9, y10);
+ x10 = _mm256_sub_epi16(y9, y10);
+ x11 = _mm256_sub_epi16(y8, y11);
+ x12 = _mm256_sub_epi16(y15, y12);
+ x13 = _mm256_sub_epi16(y14, y13);
+ x14 = _mm256_add_epi16(y14, y13);
+ x15 = _mm256_add_epi16(y15, y12);
+
+ // stage 6
+ y0 = x0;
+ y3 = x3;
+ y4 = x4;
+ y7 = x7;
+ y8 = x8;
+ y11 = x11;
+ y12 = x12;
+ y15 = x15;
+
+ u0 = _mm256_unpacklo_epi16(x1, x14);
+ u1 = _mm256_unpackhi_epi16(x1, x14);
+ y1 = butter_fly(u0, u1, cospi_m04_p28);
+ y14 = butter_fly(u0, u1, cospi_p28_p04);
+
+ u0 = _mm256_unpacklo_epi16(x2, x13);
+ u1 = _mm256_unpackhi_epi16(x2, x13);
+ y2 = butter_fly(u0, u1, cospi_m28_m04);
+ y13 = butter_fly(u0, u1, cospi_m04_p28);
+
+ u0 = _mm256_unpacklo_epi16(x5, x10);
+ u1 = _mm256_unpackhi_epi16(x5, x10);
+ y5 = butter_fly(u0, u1, cospi_m20_p12);
+ y10 = butter_fly(u0, u1, cospi_p12_p20);
+
+ u0 = _mm256_unpacklo_epi16(x6, x9);
+ u1 = _mm256_unpackhi_epi16(x6, x9);
+ y6 = butter_fly(u0, u1, cospi_m12_m20);
+ y9 = butter_fly(u0, u1, cospi_m20_p12);
+
+ // stage 7
+ x0 = _mm256_add_epi16(y0, y1);
+ x1 = _mm256_sub_epi16(y0, y1);
+ x2 = _mm256_sub_epi16(y3, y2);
+ x3 = _mm256_add_epi16(y3, y2);
+ x4 = _mm256_add_epi16(y4, y5);
+ x5 = _mm256_sub_epi16(y4, y5);
+ x6 = _mm256_sub_epi16(y7, y6);
+ x7 = _mm256_add_epi16(y7, y6);
+
+ x8 = _mm256_add_epi16(y8, y9);
+ x9 = _mm256_sub_epi16(y8, y9);
+ x10 = _mm256_sub_epi16(y11, y10);
+ x11 = _mm256_add_epi16(y11, y10);
+ x12 = _mm256_add_epi16(y12, y13);
+ x13 = _mm256_sub_epi16(y12, y13);
+ x14 = _mm256_sub_epi16(y15, y14);
+ x15 = _mm256_add_epi16(y15, y14);
+
+ // stage 8
+ u0 = _mm256_unpacklo_epi16(x0, x15);
+ u1 = _mm256_unpackhi_epi16(x0, x15);
+ in[0] = butter_fly(u0, u1, cospi_p31_p01);
+ in[15] = butter_fly(u0, u1, cospi_m01_p31);
+
+ u0 = _mm256_unpacklo_epi16(x1, x14);
+ u1 = _mm256_unpackhi_epi16(x1, x14);
+ in[1] = butter_fly(u0, u1, cospi_p15_p17);
+ in[14] = butter_fly(u0, u1, cospi_m17_p15);
+
+ u0 = _mm256_unpacklo_epi16(x2, x13);
+ u1 = _mm256_unpackhi_epi16(x2, x13);
+ in[2] = butter_fly(u0, u1, cospi_p23_p09);
+ in[13] = butter_fly(u0, u1, cospi_m09_p23);
+
+ u0 = _mm256_unpacklo_epi16(x3, x12);
+ u1 = _mm256_unpackhi_epi16(x3, x12);
+ in[3] = butter_fly(u0, u1, cospi_p07_p25);
+ in[12] = butter_fly(u0, u1, cospi_m25_p07);
+
+ u0 = _mm256_unpacklo_epi16(x4, x11);
+ u1 = _mm256_unpackhi_epi16(x4, x11);
+ in[4] = butter_fly(u0, u1, cospi_p27_p05);
+ in[11] = butter_fly(u0, u1, cospi_m05_p27);
+
+ u0 = _mm256_unpacklo_epi16(x5, x10);
+ u1 = _mm256_unpackhi_epi16(x5, x10);
+ in[5] = butter_fly(u0, u1, cospi_p11_p21);
+ in[10] = butter_fly(u0, u1, cospi_m21_p11);
+
+ u0 = _mm256_unpacklo_epi16(x6, x9);
+ u1 = _mm256_unpackhi_epi16(x6, x9);
+ in[6] = butter_fly(u0, u1, cospi_p19_p13);
+ in[9] = butter_fly(u0, u1, cospi_m13_p19);
+
+ u0 = _mm256_unpacklo_epi16(x7, x8);
+ u1 = _mm256_unpackhi_epi16(x7, x8);
+ in[7] = butter_fly(u0, u1, cospi_p03_p29);
+ in[8] = butter_fly(u0, u1, cospi_m29_p03);
+}
+
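+// 32-point forward DCT applied to each 16-column half: the even-indexed
+// output rows come from fdct16_avx2() on the stage-1 sums, the odd-indexed
+// rows from fdct16_odd_avx2() on the stage-1 differences; collect_coeffs()
+// interleaves them and mm256_transpose_32x32() finishes.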
+static void fdct32_avx2(__m256i *in0, __m256i *in1) {
+ __m256i even0[16], even1[16], odd0[16], odd1[16];
+ prepare_16x16_even(in0, even0);
+ fdct16_avx2(even0);
+
+ prepare_16x16_odd(in0, odd0);
+ fdct16_odd_avx2(odd0);
+
+ prepare_16x16_even(in1, even1);
+ fdct16_avx2(even1);
+
+ prepare_16x16_odd(in1, odd1);
+ fdct16_odd_avx2(odd1);
+
+ collect_coeffs(even0, odd0, even1, odd1, in0, in1);
+
+ mm256_transpose_32x32(in0, in1);
+}
+
+static INLINE void write_buffer_32x32(const __m256i *in0, const __m256i *in1,
+ tran_low_t *output) {
+ int i = 0;
+ const int stride = 32;
+ tran_low_t *coeff = output;
+ while (i < 32) {
+ storeu_output_avx2(&in0[i], coeff);
+ storeu_output_avx2(&in1[i], coeff + 16);
+ coeff += stride;
+ i += 1;
+ }
+}
+
+#if CONFIG_EXT_TX
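+// Half-right transform used for the 32-point ADST cases: the first 16 rows
+// are passed through scaled by 4, the last 16 rows are scaled by Sqrt2 (with
+// rounding) and run through a 16-point DCT; fhalfright32_avx2() then swaps
+// the halves so the DCT'd rows come first, and transposes.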
+static void fhalfright32_16col_avx2(__m256i *in) {
+ int i = 0;
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i sqrt2 = _mm256_set1_epi16(Sqrt2);
+ const __m256i dct_rounding = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ __m256i x0, x1;
+
+ while (i < 16) {
+ in[i] = _mm256_slli_epi16(in[i], 2);
+ x0 = _mm256_unpacklo_epi16(in[i + 16], zero);
+ x1 = _mm256_unpackhi_epi16(in[i + 16], zero);
+ x0 = _mm256_madd_epi16(x0, sqrt2);
+ x1 = _mm256_madd_epi16(x1, sqrt2);
+ x0 = _mm256_add_epi32(x0, dct_rounding);
+ x1 = _mm256_add_epi32(x1, dct_rounding);
+ x0 = _mm256_srai_epi32(x0, DCT_CONST_BITS);
+ x1 = _mm256_srai_epi32(x1, DCT_CONST_BITS);
+ in[i + 16] = _mm256_packs_epi32(x0, x1);
+ i += 1;
+ }
+ fdct16_avx2(&in[16]);
+}
+
+static void fhalfright32_avx2(__m256i *in0, __m256i *in1) {
+ fhalfright32_16col_avx2(in0);
+ fhalfright32_16col_avx2(in1);
+ mm256_vectors_swap(in0, &in0[16], 16);
+ mm256_vectors_swap(in1, &in1[16], 16);
+ mm256_transpose_32x32(in0, in1);
+}
+#endif // CONFIG_EXT_TX
+
+static INLINE void load_buffer_32x32(const int16_t *input, int stride,
+ int flipud, int fliplr, __m256i *in0,
+ __m256i *in1) {
+ // Load 4 16x16 blocks
+ const int16_t *topL = input;
+ const int16_t *topR = input + 16;
+ const int16_t *botL = input + 16 * stride;
+ const int16_t *botR = input + 16 * stride + 16;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ // Swap left columns
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ // Swap right columns
+ tmp = topR;
+ topR = botR;
+ botR = tmp;
+ }
+
+ if (fliplr) {
+ // Swap top rows
+ tmp = topL;
+ topL = topR;
+ topR = tmp;
+ // Swap bottom rows
+ tmp = botL;
+ botL = botR;
+ botR = tmp;
+ }
+
+ // load first 16 columns
+ load_buffer_16x16(topL, stride, flipud, fliplr, in0);
+ load_buffer_16x16(botL, stride, flipud, fliplr, in0 + 16);
+
+ // load second 16 columns
+ load_buffer_16x16(topR, stride, flipud, fliplr, in1);
+ load_buffer_16x16(botR, stride, flipud, fliplr, in1 + 16);
+}
+
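+// Rounding right shift by 2 with an extra bias for negative values:
+//   out = (x + 1 + (x < 0 ? 1 : 0)) >> 2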
+static void nr_right_shift_32x32_16col(__m256i *in) {
+ int i = 0;
+ const __m256i one = _mm256_set1_epi16(1);
+ __m256i sign;
+ while (i < 32) {
+ sign = _mm256_srai_epi16(in[i], 15);
+ in[i] = _mm256_add_epi16(in[i], one);
+ in[i] = _mm256_sub_epi16(in[i], sign);
+ in[i] = _mm256_srai_epi16(in[i], 2);
+ i += 1;
+ }
+}
+
+// Negative rounding
+static void nr_right_shift_32x32(__m256i *in0, __m256i *in1) {
+ nr_right_shift_32x32_16col(in0);
+ nr_right_shift_32x32_16col(in1);
+}
+
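+// Rounding right shift by 2 with an extra bias for positive values:
+//   out = (x + 1 + (x > 0 ? 1 : 0)) >> 2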
+static INLINE void pr_right_shift_32x32_16col(__m256i *in) {
+ int i = 0;
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i one = _mm256_set1_epi16(1);
+ __m256i sign;
+ while (i < 32) {
+ sign = _mm256_cmpgt_epi16(in[i], zero);
+ in[i] = _mm256_add_epi16(in[i], one);
+ in[i] = _mm256_sub_epi16(in[i], sign);
+ in[i] = _mm256_srai_epi16(in[i], 2);
+ i += 1;
+ }
+}
+
+// Positive rounding
+static INLINE void pr_right_shift_32x32(__m256i *in0, __m256i *in1) {
+ pr_right_shift_32x32_16col(in0);
+ pr_right_shift_32x32_16col(in1);
+}
+
+#if CONFIG_EXT_TX
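+// 32x32 identity transform: scale the samples by 4 (<< 2) and transpose.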
+static void fidtx32_avx2(__m256i *in0, __m256i *in1) {
+ int i = 0;
+ while (i < 32) {
+ in0[i] = _mm256_slli_epi16(in0[i], 2);
+ in1[i] = _mm256_slli_epi16(in1[i], 2);
+ i += 1;
+ }
+ mm256_transpose_32x32(in0, in1);
+}
+#endif
+
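+// Returns 1 if any sample in the first 'row' rows of either half has a
+// magnitude above 63; the DCT_DCT case below then falls back to
+// aom_fdct32x32_avx2(), presumably to keep the 16-bit intermediates from
+// overflowing.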
+static INLINE int range_check_dct32x32(const __m256i *in0, const __m256i *in1,
+ int row) {
+ __m256i value, bits0, bits1;
+ const __m256i bound = _mm256_set1_epi16((1 << 6) - 1);
+ int flag;
+ int i = 0;
+
+ while (i < row) {
+ value = _mm256_abs_epi16(in0[i]);
+ bits0 = _mm256_cmpgt_epi16(value, bound);
+ value = _mm256_abs_epi16(in1[i]);
+ bits1 = _mm256_cmpgt_epi16(value, bound);
+ bits0 = _mm256_or_si256(bits0, bits1);
+ flag = _mm256_movemask_epi8(bits0);
+ if (flag) return 1;
+ i++;
+ }
+ return 0;
+}
+
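+// 32x32 forward hybrid transform. Each case applies the first 1-D transform
+// (which ends with a transpose), the positively biased rounding shift, and
+// the second 1-D transform; the common tail applies the negatively biased
+// rounding shift and stores the coefficients.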
+void av1_fht32x32_avx2(const int16_t *input, tran_low_t *output, int stride,
+ int tx_type) {
+  __m256i in0[32];  // left 16 columns (all 32 rows)
+  __m256i in1[32];  // right 16 columns (all 32 rows)
+
+ switch (tx_type) {
+ case DCT_DCT:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ if (range_check_dct32x32(in0, in1, 32)) {
+ aom_fdct32x32_avx2(input, output, stride);
+ return;
+ }
+ fdct32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fdct32_avx2(in0, in1);
+ break;
+#if CONFIG_EXT_TX
+ case ADST_DCT:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fdct32_avx2(in0, in1);
+ break;
+ case DCT_ADST:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fdct32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case ADST_ADST:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case FLIPADST_DCT:
+ load_buffer_32x32(input, stride, 1, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fdct32_avx2(in0, in1);
+ break;
+ case DCT_FLIPADST:
+ load_buffer_32x32(input, stride, 0, 1, in0, in1);
+ fdct32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case FLIPADST_FLIPADST:
+ load_buffer_32x32(input, stride, 1, 1, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case ADST_FLIPADST:
+ load_buffer_32x32(input, stride, 0, 1, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case FLIPADST_ADST:
+ load_buffer_32x32(input, stride, 1, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case V_DCT:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fdct32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fidtx32_avx2(in0, in1);
+ break;
+ case H_DCT:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fidtx32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fdct32_avx2(in0, in1);
+ break;
+ case V_ADST:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fidtx32_avx2(in0, in1);
+ break;
+ case H_ADST:
+ load_buffer_32x32(input, stride, 0, 0, in0, in1);
+ fidtx32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+ case V_FLIPADST:
+ load_buffer_32x32(input, stride, 1, 0, in0, in1);
+ fhalfright32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fidtx32_avx2(in0, in1);
+ break;
+ case H_FLIPADST:
+ load_buffer_32x32(input, stride, 0, 1, in0, in1);
+ fidtx32_avx2(in0, in1);
+ pr_right_shift_32x32(in0, in1);
+ fhalfright32_avx2(in0, in1);
+ break;
+#endif // CONFIG_EXT_TX
+ default: assert(0); break;
+ }
+ nr_right_shift_32x32(in0, in1);
+ write_buffer_32x32(in0, in1, output);
+ _mm256_zeroupper();
+}
diff --git a/av1/encoder/x86/wedge_utils_sse2.c b/av1/encoder/x86/wedge_utils_sse2.c
new file mode 100644
index 0000000..35e8493
--- /dev/null
+++ b/av1/encoder/x86/wedge_utils_sse2.c
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c
+ */
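+// Per element, as implied by the SIMD below: t = d * m + r1 * MAX_MASK_VALUE,
+// saturated to 16 bits; the squared t values are accumulated and the total is
+// returned as ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS).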
+uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ int n = -N;
+ int n8 = n + 8;
+
+ uint64_t csse;
+
+ const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
+ const __m128i v_zext_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+
+ __m128i v_acc0_q = _mm_setzero_si128();
+
+ assert(N % 64 == 0);
+
+ r1 += N;
+ d += N;
+ m += N;
+
+ do {
+ const __m128i v_r0_w = xx_load_128(r1 + n);
+ const __m128i v_r1_w = xx_load_128(r1 + n8);
+ const __m128i v_d0_w = xx_load_128(d + n);
+ const __m128i v_d1_w = xx_load_128(d + n8);
+ const __m128i v_m01_b = xx_load_128(m + n);
+
+ const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
+ const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
+ const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
+ const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+
+ const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
+ const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
+ const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
+ const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);
+
+ const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
+ const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
+ const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
+ const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);
+
+ const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
+ const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);
+
+ const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
+ const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);
+
+ const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
+ _mm_srli_epi64(v_sq0_d, 32));
+ const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
+ _mm_srli_epi64(v_sq1_d, 32));
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);
+
+ n8 += 16;
+ n += 16;
+ } while (n);
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
+
+#if ARCH_X86_64
+ csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
+#else
+ xx_storel_64(&csse, v_acc0_q);
+#endif
+
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * See av1_wedge_sign_from_residuals_c
+ */
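+// Accumulates sum(m[i] * ds[i]) in 32-bit lanes, sign-extends the partial
+// sums to 64 bits, and returns whether the total exceeds 'limit'.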
+int av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
+ int N, int64_t limit) {
+ int64_t acc;
+
+ __m128i v_sign_d;
+ __m128i v_acc0_d = _mm_setzero_si128();
+ __m128i v_acc1_d = _mm_setzero_si128();
+ __m128i v_acc_q;
+
+  // Input size is limited to 8192 by the use of 32-bit accumulators and m
+  // being in [0, 64]. Overflow could occur at larger sizes, though it is
+  // very unlikely with real video input.
+ assert(N < 8192);
+ assert(N % 64 == 0);
+
+ do {
+ const __m128i v_m01_b = xx_load_128(m);
+ const __m128i v_m23_b = xx_load_128(m + 16);
+ const __m128i v_m45_b = xx_load_128(m + 32);
+ const __m128i v_m67_b = xx_load_128(m + 48);
+
+ const __m128i v_d0_w = xx_load_128(ds);
+ const __m128i v_d1_w = xx_load_128(ds + 8);
+ const __m128i v_d2_w = xx_load_128(ds + 16);
+ const __m128i v_d3_w = xx_load_128(ds + 24);
+ const __m128i v_d4_w = xx_load_128(ds + 32);
+ const __m128i v_d5_w = xx_load_128(ds + 40);
+ const __m128i v_d6_w = xx_load_128(ds + 48);
+ const __m128i v_d7_w = xx_load_128(ds + 56);
+
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
+ const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
+ const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
+ const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
+ const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
+ const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());
+
+ const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
+ const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
+ const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
+ const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
+ const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
+ const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
+ const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
+ const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);
+
+ const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
+ const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
+ const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
+ const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);
+
+ const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
+ const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);
+
+ v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
+ v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);
+
+ ds += 64;
+ m += 64;
+
+ N -= 64;
+ } while (N);
+
+ v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
+ v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
+ _mm_unpackhi_epi32(v_acc0_d, v_sign_d));
+
+ v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
+ v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
+ _mm_unpackhi_epi32(v_acc1_d, v_sign_d));
+
+ v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+
+#if ARCH_X86_64
+ acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q);
+#else
+ xx_storel_64(&acc, v_acc_q);
+#endif
+
+ return acc > limit;
+}
+
+// Negate under mask
+static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
+ return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w);
+}
+
+/**
+ * See av1_wedge_compute_delta_squares_c
+ */
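+// Per element: d[i] = a[i] * a[i] - b[i] * b[i], saturated to the int16_t
+// range by the final packs.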
+void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a,
+ const int16_t *b, int N) {
+ const __m128i v_neg_w =
+ _mm_set_epi16(0xffff, 0, 0xffff, 0, 0xffff, 0, 0xffff, 0);
+
+ assert(N % 64 == 0);
+
+ do {
+ const __m128i v_a0_w = xx_load_128(a);
+ const __m128i v_b0_w = xx_load_128(b);
+ const __m128i v_a1_w = xx_load_128(a + 8);
+ const __m128i v_b1_w = xx_load_128(b + 8);
+ const __m128i v_a2_w = xx_load_128(a + 16);
+ const __m128i v_b2_w = xx_load_128(b + 16);
+ const __m128i v_a3_w = xx_load_128(a + 24);
+ const __m128i v_b3_w = xx_load_128(b + 24);
+
+ const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
+ const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
+ const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
+ const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
+ const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
+ const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
+ const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
+ const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);
+
+ // Negate top word of pairs
+ const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
+ const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
+ const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
+ const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
+ const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
+ const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
+ const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
+ const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);
+
+ const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
+ const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
+ const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
+ const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
+ const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
+ const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
+ const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
+ const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);
+
+ const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w);
+ const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w);
+ const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w);
+ const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w);
+
+ xx_store_128(d, v_r0_w);
+ xx_store_128(d + 8, v_r1_w);
+ xx_store_128(d + 16, v_r2_w);
+ xx_store_128(d + 24, v_r3_w);
+
+ a += 32;
+ b += 32;
+ d += 32;
+ N -= 32;
+ } while (N);
+}
diff --git a/build/make/Makefile b/build/make/Makefile
index 1a14f10..481398f 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -8,6 +8,7 @@
## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
##
+
include config.mk
quiet?=true
ifeq ($(target),)
@@ -25,7 +26,7 @@
testdata:: .DEFAULT
utiltest: .DEFAULT
exampletest-no-data-check utiltest-no-data-check: .DEFAULT
-
+test_%: .DEFAULT ;
# Note: md5sum is not installed on OS X, but openssl is. Openssl may not be
# installed on cygwin, so we need to autodetect here.
@@ -118,29 +119,25 @@
test-no-data-check::
exampletest-no-data-check utiltest-no-data-check:
-# Add compiler flags for intrinsic files
+# Always force stack realignment on OS/2
ifeq ($(TOOLCHAIN), x86-os2-gcc)
-STACKREALIGN=-mstackrealign
-else
-STACKREALIGN=
+CFLAGS += -mstackrealign
endif
$(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx
$(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx
-$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 $(STACKREALIGN)
-$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2 $(STACKREALIGN)
-$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3 $(STACKREALIGN)
-$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3 $(STACKREALIGN)
-$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3 $(STACKREALIGN)
-$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3 $(STACKREALIGN)
-$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1 $(STACKREALIGN)
-$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1 $(STACKREALIGN)
-$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(STACKREALIGN)
-$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(STACKREALIGN)
-$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(STACKREALIGN)
-$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 $(STACKREALIGN)
-$(BUILD_PFX)%av1_reconintra.c.d: CFLAGS += $(STACKREALIGN)
-$(BUILD_PFX)%av1_reconintra.c.o: CFLAGS += $(STACKREALIGN)
+$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2
+$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2
+$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3
+$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3
+$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3
+$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3
+$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1
+$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1
+$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx
+$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx
+$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2
+$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2
$(BUILD_PFX)%.c.d: %.c
$(if $(quiet),@echo " [DEP] $@")
@@ -451,3 +448,5 @@
install:: $(INSTALL_TARGETS)
dist: $(INSTALL_TARGETS)
test::
+
+.SUFFIXES: # Delete default suffix rules
diff --git a/build/make/configure.sh b/build/make/configure.sh
index cc3dcfa..6d9b215 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -185,6 +185,7 @@
#
# Boolean Manipulation Functions
#
+
enable_feature(){
set_all yes $*
}
@@ -201,6 +202,20 @@
eval test "x\$$1" = "xno"
}
+enable_codec(){
+ enabled "${1}" || echo " enabling ${1}"
+ enable_feature "${1}"
+
+ is_in "${1}" av1 && enable_feature "${1}_encoder" "${1}_decoder"
+}
+
+disable_codec(){
+ disabled "${1}" || echo " disabling ${1}"
+ disable_feature "${1}"
+
+ is_in "${1}" av1 && disable_feature "${1}_encoder" "${1}_decoder"
+}
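+# For example, "disable_codec av1" also disables av1_encoder and av1_decoder.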
+
# Iterates through positional parameters, checks to confirm the parameter has
# not been explicitly (force) disabled, and enables the setting controlled by
# the parameter when the setting is not disabled.
@@ -389,7 +404,7 @@
}
write_common_config_banner() {
- print_aomedia_license config.mk "##" ""
+ print_webm_license config.mk "##" ""
echo '# This file automatically generated by configure. Do not edit!' >> config.mk
echo "TOOLCHAIN := ${toolchain}" >> config.mk
@@ -419,7 +434,7 @@
saved_CXX="${CXX}"
enabled ccache && CC="ccache ${CC}"
enabled ccache && CXX="ccache ${CXX}"
- print_aomedia_license $1 "##" ""
+ print_webm_license $1 "##" ""
cat >> $1 << EOF
# This file automatically generated by configure. Do not edit!
@@ -471,7 +486,7 @@
}
write_common_target_config_h() {
- print_aomedia_license ${TMP_H} "/*" " */"
+ print_webm_license ${TMP_H} "/*" " */"
cat >> ${TMP_H} << EOF
/* This file automatically generated by configure. Do not edit! */
#ifndef AOM_CONFIG_H
@@ -521,22 +536,20 @@
;;
--enable-?*|--disable-?*)
eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
- if echo "${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null; then
+ if is_in ${option} ${ARCH_EXT_LIST}; then
[ $action = "disable" ] && RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${option} "
elif [ $action = "disable" ] && ! disabled $option ; then
- echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
- die_unknown $opt
+ is_in ${option} ${CMDLINE_SELECT} || die_unknown $opt
log_echo " disabling $option"
elif [ $action = "enable" ] && ! enabled $option ; then
- echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
- die_unknown $opt
+ is_in ${option} ${CMDLINE_SELECT} || die_unknown $opt
log_echo " enabling $option"
fi
${action}_feature $option
;;
--require-?*)
eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
- if echo "${ARCH_EXT_LIST}" none | grep "^ *$option\$" >/dev/null; then
+ if is_in ${option} ${ARCH_EXT_LIST}; then
RTCD_OPTIONS="${RTCD_OPTIONS}${opt} "
else
die_unknown $opt
@@ -638,16 +651,39 @@
xcrun --sdk $1 --show-sdk-version 2>/dev/null | cut -d. -f1
}
+# Print the Xcode version.
+show_xcode_version() {
+ xcodebuild -version | head -n1 | cut -d' ' -f2
+}
+
+# Fails when Xcode version is less than 6.3.
+check_xcode_minimum_version() {
+ xcode_major=$(show_xcode_version | cut -f1 -d.)
+ xcode_minor=$(show_xcode_version | cut -f2 -d.)
+ xcode_min_major=6
+ xcode_min_minor=3
+ if [ ${xcode_major} -lt ${xcode_min_major} ]; then
+ return 1
+ fi
+ if [ ${xcode_major} -eq ${xcode_min_major} ] \
+ && [ ${xcode_minor} -lt ${xcode_min_minor} ]; then
+ return 1
+ fi
+}
+
process_common_toolchain() {
if [ -z "$toolchain" ]; then
gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
# detect tgt_isa
case "$gcctarget" in
+ aarch64*)
+ tgt_isa=arm64
+ ;;
armv6*)
tgt_isa=armv6
;;
- armv7*-hardfloat*)
+ armv7*-hardfloat* | armv7*-gnueabihf | arm-*-gnueabihf)
tgt_isa=armv7
float_abi=hard
;;
@@ -748,7 +784,14 @@
enabled shared && soft_enable pic
# Minimum iOS version for all target platforms (darwin and iphonesimulator).
- IOS_VERSION_MIN="6.0"
+ # Shared library framework builds are only possible on iOS 8 and later.
+ if enabled shared; then
+ IOS_VERSION_OPTIONS="--enable-shared"
+ IOS_VERSION_MIN="8.0"
+ else
+ IOS_VERSION_OPTIONS=""
+ IOS_VERSION_MIN="6.0"
+ fi
# Handle darwin variants. Newer SDKs allow targeting older
# platforms, so use the newest one available.
@@ -877,7 +920,6 @@
case ${tgt_cc} in
gcc)
- CROSS=${CROSS:-arm-none-linux-gnueabi-}
link_with_cc=gcc
setup_gnu_toolchain
arch_int=${tgt_isa##armv}
@@ -899,6 +941,9 @@
check_add_cflags -mfpu=neon #-ftree-vectorize
check_add_asflags -mfpu=neon
fi
+ elif [ ${tgt_isa} = "arm64" ] || [ ${tgt_isa} = "armv8" ]; then
+ check_add_cflags -march=armv8-a
+ check_add_asflags -march=armv8-a
else
check_add_cflags -march=${tgt_isa}
check_add_asflags -march=${tgt_isa}
@@ -966,6 +1011,10 @@
;;
android*)
+ if [ -z "${sdk_path}" ]; then
+ die "Must specify --sdk-path for Android builds."
+ fi
+
SDK_PATH=${sdk_path}
COMPILER_LOCATION=`find "${SDK_PATH}" \
-name "arm-linux-androideabi-gcc*" -print -quit`
@@ -1016,18 +1065,7 @@
NM="$(${XCRUN_FIND} nm)"
RANLIB="$(${XCRUN_FIND} ranlib)"
AS_SFX=.s
-
- # Special handling of ld for armv6 because libclang_rt.ios.a does
- # not contain armv6 support in Apple's clang package:
- # Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn).
- # TODO(tomfinegan): Remove this. Our minimum iOS version (6.0)
- # renders support for armv6 unnecessary because the 3GS and up
- # support neon.
- if [ "${tgt_isa}" = "armv6" ]; then
- LD="$(${XCRUN_FIND} ld)"
- else
- LD="${CXX:-$(${XCRUN_FIND} ld)}"
- fi
+ LD="${CXX:-$(${XCRUN_FIND} ld)}"
# ASFLAGS is written here instead of using check_add_asflags
# because we need to overwrite all of ASFLAGS and purge the
@@ -1053,6 +1091,19 @@
[ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
done
+ case ${tgt_isa} in
+ armv7|armv7s|armv8|arm64)
+ if enabled neon && ! check_xcode_minimum_version; then
+ soft_disable neon
+ log_echo " neon disabled: upgrade Xcode (need v6.3+)."
+ if enabled neon_asm; then
+ soft_disable neon_asm
+ log_echo " neon_asm disabled: upgrade Xcode (need v6.3+)."
+ fi
+ fi
+ ;;
+ esac
+
asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl"
if [ "$(show_darwin_sdk_major_version iphoneos)" -gt 8 ]; then
@@ -1067,7 +1118,7 @@
if enabled rvct; then
# Check if we have CodeSourcery GCC in PATH. Needed for
# libraries
- hash arm-none-linux-gnueabi-gcc 2>&- || \
+ which arm-none-linux-gnueabi-gcc 2>&- || \
die "Couldn't find CodeSourcery GCC from PATH"
# Use armcc as a linker to enable translation of
@@ -1102,13 +1153,13 @@
if [ -n "${tune_cpu}" ]; then
case ${tune_cpu} in
p5600)
- check_add_cflags -mips32r5 -funroll-loops -mload-store-pairs
+ check_add_cflags -mips32r5 -mload-store-pairs
check_add_cflags -msched-weight -mhard-float -mfp64
check_add_asflags -mips32r5 -mhard-float -mfp64
check_add_ldflags -mfp64
;;
- i6400)
- check_add_cflags -mips64r6 -mabi=64 -funroll-loops -msched-weight
+ i6400|p6600)
+ check_add_cflags -mips64r6 -mabi=64 -msched-weight
check_add_cflags -mload-store-pairs -mhard-float -mfp64
check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64
check_add_ldflags -mips64r6 -mabi=64 -mfp64
@@ -1135,7 +1186,7 @@
CC=${CC:-${CROSS}gcc}
CXX=${CXX:-${CROSS}g++}
LD=${LD:-${CROSS}gcc}
- CROSS=${CROSS:-g}
+ CROSS=${CROSS-g}
;;
os2)
disable_feature pic
@@ -1188,6 +1239,12 @@
soft_disable avx2
;;
esac
+ case $vc_version in
+ 7|8|9)
+ echo "${tgt_cc} omits stdint.h, disabling webm-io..."
+ soft_disable webm_io
+ ;;
+ esac
;;
esac
@@ -1339,10 +1396,6 @@
fi
fi
- if [ "${tgt_isa}" = "x86_64" ] || [ "${tgt_isa}" = "x86" ]; then
- soft_enable use_x86inc
- fi
-
# Position Independent Code (PIC) support, for building relocatable
# shared objects
enabled gcc && enabled pic && check_add_cflags -fPIC
@@ -1446,22 +1499,20 @@
done
}
-print_aomedia_license() {
+print_webm_license() {
saved_prefix="${prefix}"
destination=$1
prefix="$2"
suffix="$3"
shift 3
cat <<EOF > ${destination}
-
-${prefix} Copyright (c) 2016, Alliance for Open Media. All rights reserved${suffix}
+${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix}
${prefix} ${suffix}
-${prefix} This source code is subject to the terms of the BSD 2 Clause License and${suffix}
-${prefix} the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License${suffix}
-${prefix} was not distributed with this source code in the LICENSE file, you can${suffix}
-${prefix} obtain it at www.aomedia.org/license/software. If the Alliance for Open${suffix}
-${prefix} Media Patent License 1.0 was not distributed with this source code in the${suffix}
-${prefix} PATENTS file, you can obtain it at www.aomedia.org/license/patent.${suffix}
+${prefix} Use of this source code is governed by a BSD-style license${suffix}
+${prefix} that can be found in the LICENSE file in the root of the source${suffix}
+${prefix} tree. An additional intellectual property rights grant can be found${suffix}
+${prefix} in the file PATENTS. All contributing project authors may${suffix}
+${prefix} be found in the AUTHORS file in the root of the source tree.${suffix}
EOF
prefix="${saved_prefix}"
}
diff --git a/build/make/ios-Info.plist b/build/make/ios-Info.plist
new file mode 100644
index 0000000..300e3e3
--- /dev/null
+++ b/build/make/ios-Info.plist
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+ <key>CFBundleDevelopmentRegion</key>
+ <string>en</string>
+ <key>CFBundleExecutable</key>
+ <string>AOM</string>
+ <key>CFBundleIdentifier</key>
+ <string>org.webmproject.AOM</string>
+ <key>CFBundleInfoDictionaryVersion</key>
+ <string>6.0</string>
+ <key>CFBundleName</key>
+ <string>AOM</string>
+ <key>CFBundlePackageType</key>
+ <string>FMWK</string>
+ <key>CFBundleShortVersionString</key>
+ <string>${VERSION}</string>
+ <key>CFBundleSignature</key>
+ <string>????</string>
+ <key>CFBundleSupportedPlatforms</key>
+ <array>
+ <string>iPhoneOS</string>
+ </array>
+ <key>CFBundleVersion</key>
+ <string>${VERSION}</string>
+ <key>MinimumOSVersion</key>
+ <string>${IOS_VERSION_MIN}</string>
+ <key>UIDeviceFamily</key>
+ <array>
+ <integer>1</integer>
+ <integer>2</integer>
+ </array>
+ <key>AOMFullVersion</key>
+ <string>${FULLVERSION}</string>
+</dict>
+</plist>
diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh
index 1600b39..8f7325c 100755
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@@ -23,6 +23,7 @@
--disable-unit-tests"
DIST_DIR="_dist"
FRAMEWORK_DIR="AOM.framework"
+FRAMEWORK_LIB="AOM.framework/AOM"
HEADER_DIR="${FRAMEWORK_DIR}/Headers/aom"
SCRIPT_DIR=$(dirname "$0")
LIBAOM_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
@@ -38,7 +39,7 @@
TARGETS="${ARM_TARGETS} ${SIM_TARGETS}"
# Configures for the target specified by $1, and invokes make with the dist
-# target using $DIST_DIR as the distribution output directory.
+# target using $DIST_DIR as the distribution output directory.
build_target() {
local target="$1"
local old_pwd="$(pwd)"
@@ -102,14 +103,13 @@
local include_guard="AOM_FRAMEWORK_HEADERS_AOM_AOM_CONFIG_H_"
local file_header="/*
- * Copyright (c) $(date +%Y) Alliance for Open Media. All rights reserved
+ * Copyright (c) $(date +%Y) The WebM project authors. All Rights Reserved.
*
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
*/
/* GENERATED FILE: DO NOT EDIT! */
@@ -137,6 +137,44 @@
printf "#endif // ${include_guard}" >> "${config_file}"
}
+# Verifies that the $FRAMEWORK_LIB fat library contains the requested builds.
+verify_framework_targets() {
+ local requested_cpus=""
+ local cpu=""
+
+ # Extract CPU from full target name.
+ for target; do
+ cpu="${target%%-*}"
+ if [ "${cpu}" = "x86" ]; then
+ # lipo -info outputs i386 for libaom x86 targets.
+ cpu="i386"
+ fi
+ requested_cpus="${requested_cpus}${cpu} "
+ done
+
+ # Get target CPUs present in framework library.
+ local targets_built=$(${LIPO} -info ${FRAMEWORK_LIB})
+
+ # $LIPO -info outputs a string like the following:
+ # Architectures in the fat file: $FRAMEWORK_LIB <architectures>
+ # Capture only the architecture strings.
+ targets_built=${targets_built##*: }
+
+ # Sort CPU strings to make the next step a simple string compare.
+ local actual=$(echo ${targets_built} | tr " " "\n" | sort | tr "\n" " ")
+ local requested=$(echo ${requested_cpus} | tr " " "\n" | sort | tr "\n" " ")
+
+ vlog "Requested ${FRAMEWORK_LIB} CPUs: ${requested}"
+ vlog "Actual ${FRAMEWORK_LIB} CPUs: ${actual}"
+
+ if [ "${requested}" != "${actual}" ]; then
+ elog "Actual ${FRAMEWORK_LIB} targets do not match requested target list."
+ elog " Requested target CPUs: ${requested}"
+ elog " Actual target CPUs: ${actual}"
+ return 1
+ fi
+}
+
# Configures and builds each target specified by $1, and then builds
# AOM.framework.
build_framework() {
@@ -157,7 +195,12 @@
for target in ${targets}; do
build_target "${target}"
target_dist_dir="${BUILD_ROOT}/${target}/${DIST_DIR}"
- lib_list="${lib_list} ${target_dist_dir}/lib/libaom.a"
+ if [ "${ENABLE_SHARED}" = "yes" ]; then
+ local suffix="dylib"
+ else
+ local suffix="a"
+ fi
+ lib_list="${lib_list} ${target_dist_dir}/lib/libaom.${suffix}"
done
cd "${ORIG_PWD}"
@@ -176,13 +219,25 @@
# Copy in aom_version.h.
cp -p "${BUILD_ROOT}/${target}/aom_version.h" "${HEADER_DIR}"
- vlog "Created fat library ${FRAMEWORK_DIR}/AOM containing:"
+ if [ "${ENABLE_SHARED}" = "yes" ]; then
+ # Adjust the dylib's name so dynamic linking in apps works as expected.
+ install_name_tool -id '@rpath/AOM.framework/AOM' ${FRAMEWORK_DIR}/AOM
+
+ # Copy in Info.plist.
+ cat "${SCRIPT_DIR}/ios-Info.plist" \
+ | sed "s/\${FULLVERSION}/${FULLVERSION}/g" \
+ | sed "s/\${VERSION}/${VERSION}/g" \
+ | sed "s/\${IOS_VERSION_MIN}/${IOS_VERSION_MIN}/g" \
+ > "${FRAMEWORK_DIR}/Info.plist"
+ fi
+
+ # Confirm AOM.framework/AOM contains the targets requested.
+ verify_framework_targets ${targets}
+
+ vlog "Created fat library ${FRAMEWORK_LIB} containing:"
for lib in ${lib_list}; do
vlog " $(echo ${lib} | awk -F / '{print $2, $NF}')"
done
-
- # TODO(tomfinegan): Verify that expected targets are included within
- # AOM.framework/AOM via lipo -info.
}
# Trap function. Cleans up the subtree used to build all targets contained in
@@ -213,6 +268,7 @@
cat << EOF
Usage: ${0##*/} [arguments]
--help: Display this message and exit.
+ --enable-shared: Build a dynamic framework for use on iOS 8 or later.
--extra-configure-args <args>: Extra args to pass when configuring libaom.
--macosx: Uses darwin15 targets instead of iphonesimulator targets for x86
and x86_64. Allows linking to framework when builds target MacOSX
@@ -251,6 +307,9 @@
iosbuild_usage
exit
;;
+ --enable-shared)
+ ENABLE_SHARED=yes
+ ;;
--preserve-build-output)
PRESERVE_BUILD_OUTPUT=yes
;;
@@ -278,6 +337,21 @@
shift
done
+if [ "${ENABLE_SHARED}" = "yes" ]; then
+ CONFIGURE_ARGS="--enable-shared ${CONFIGURE_ARGS}"
+fi
+
+FULLVERSION=$("${SCRIPT_DIR}"/version.sh --bare "${LIBAOM_SOURCE_DIR}")
+VERSION=$(echo "${FULLVERSION}" | sed -E 's/^v([0-9]+\.[0-9]+\.[0-9]+).*$/\1/')
+
+if [ "$ENABLE_SHARED" = "yes" ]; then
+ IOS_VERSION_OPTIONS="--enable-shared"
+ IOS_VERSION_MIN="8.0"
+else
+ IOS_VERSION_OPTIONS=""
+ IOS_VERSION_MIN="6.0"
+fi
+
if [ "${VERBOSE}" = "yes" ]; then
cat << EOF
BUILD_ROOT=${BUILD_ROOT}
@@ -285,6 +359,7 @@
CONFIGURE_ARGS=${CONFIGURE_ARGS}
EXTRA_CONFIGURE_ARGS=${EXTRA_CONFIGURE_ARGS}
FRAMEWORK_DIR=${FRAMEWORK_DIR}
+ FRAMEWORK_LIB=${FRAMEWORK_LIB}
HEADER_DIR=${HEADER_DIR}
LIBAOM_SOURCE_DIR=${LIBAOM_SOURCE_DIR}
LIPO=${LIPO}
@@ -292,8 +367,13 @@
ORIG_PWD=${ORIG_PWD}
PRESERVE_BUILD_OUTPUT=${PRESERVE_BUILD_OUTPUT}
TARGETS="$(print_list "" ${TARGETS})"
+ ENABLE_SHARED=${ENABLE_SHARED}
OSX_TARGETS="${OSX_TARGETS}"
SIM_TARGETS="${SIM_TARGETS}"
+ SCRIPT_DIR="${SCRIPT_DIR}"
+ FULLVERSION="${FULLVERSION}"
+ VERSION="${VERSION}"
+ IOS_VERSION_MIN="${IOS_VERSION_MIN}"
EOF
fi
diff --git a/codereview.settings b/codereview.settings
index cc1d867..d7c8d39 100644
--- a/codereview.settings
+++ b/codereview.settings
@@ -1,4 +1,4 @@
# This file is used by gcl to get repository specific information.
-GERRIT_HOST: aomedia-review.googlesource.com
+GERRIT_HOST: chromium-review.googlesource.com
GERRIT_PORT: 29418
-CODE_REVIEW_SERVER: aomedia-review.googlesource.com
+CODE_REVIEW_SERVER: chromium-review.googlesource.com
diff --git a/configure b/configure
index 7f867e5..75a8844 100755
--- a/configure
+++ b/configure
@@ -35,8 +35,13 @@
${toggle_debug_libs} in/exclude debug version of libraries
${toggle_static_msvcrt} use static MSVCRT (VS builds only)
${toggle_aom_highbitdepth} use high bit depth (10/12) profiles
+ ${toggle_better_hw_compatibility}
+ enable encoder to produce streams with better
+ hardware decoder compatibility
${toggle_av1} AV1 codec support
${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders)
+ ${toggle_postproc} postprocessing
+ ${toggle_av1_postproc} av1 specific postprocessing
${toggle_multithread} multithreaded encoding and decoding
${toggle_spatial_resampling} spatial sampling (scaling) support
${toggle_realtime_only} enable this option while building for real-time encoding
@@ -49,8 +54,11 @@
${toggle_shared} shared library support
${toggle_static} static library support
${toggle_small} favor smaller size over speed
+ ${toggle_postproc_visualizer} macro block / block level visualizers
${toggle_multi_res_encoding} enable multiple-resolution encoding
${toggle_temporal_denoising} enable temporal denoising and disable the spatial denoiser
+ ${toggle_av1_temporal_denoising}
+ enable av1 temporal denoising
${toggle_webm_io} enable input from and output to WebM container
${toggle_libyuv} enable libyuv
${toggle_accounting} enable bit accounting
@@ -89,11 +97,11 @@
# all_platforms is a list of all supported target platforms. Maintain
# alphabetically by architecture, generic-gnu last.
-all_platforms="${all_platforms} armv6-darwin-gcc"
+all_platforms="${all_platforms} arm64-darwin-gcc"
+all_platforms="${all_platforms} arm64-linux-gcc"
all_platforms="${all_platforms} armv6-linux-rvct"
all_platforms="${all_platforms} armv6-linux-gcc"
all_platforms="${all_platforms} armv6-none-rvct"
-all_platforms="${all_platforms} arm64-darwin-gcc"
all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8
all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8
all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8
@@ -103,6 +111,7 @@
all_platforms="${all_platforms} armv7-win32-vs12"
all_platforms="${all_platforms} armv7-win32-vs14"
all_platforms="${all_platforms} armv7s-darwin-gcc"
+all_platforms="${all_platforms} armv8-linux-gcc"
all_platforms="${all_platforms} mips32-linux-gcc"
all_platforms="${all_platforms} mips64-linux-gcc"
all_platforms="${all_platforms} sparc-solaris-gcc"
@@ -181,7 +190,7 @@
fi
# disable codecs when their source directory does not exist
-[ -d "${source_path}/av1" ] || disable_feature av1
+[ -d "${source_path}/av1" ] || disable_codec av1
# install everything except the sources, by default. sources will have
# to be enabled when doing dist builds, since that's no longer a common
@@ -242,35 +251,51 @@
unistd_h
"
EXPERIMENT_LIST="
- spatial_svc
fp_mb_stats
emulate_hardware
clpf
dering
+ var_tx
+ rect_tx
ref_mv
+ dual_filter
+ ext_tx
+ tx64x64
sub8x8_mc
ext_intra
+ filter_intra
+ ext_inter
ext_interp
- ext_tx
- motion_var
ext_refs
- ext_compound
+ global_motion
+ new_quant
supertx
ans
+ rans
ec_multisymbol
+ loop_restoration
+ ext_partition
+ ext_partition_types
+ ext_tile
+ motion_var
+ warped_motion
+ entropy
+ bidir_pred
+ bitstream_debug
+ alt_intra
+ palette
daala_ec
pvq
- parallel_deblocking
cb4x4
- palette
frame_size
- filter_7bit
delta_q
adapt_scan
- bitstream_debug
+ filter_7bit
+ parallel_deblocking
tile_groups
ec_adapt
simp_mv_pred
+ rd_debug
"
CONFIG_LIST="
dependency_tracking
@@ -279,7 +304,6 @@
install_bins
install_libs
install_srcs
- use_x86inc
debug
gprof
gcov
@@ -295,6 +319,8 @@
dequant_tokens
dc_recon
runtime_cpu_detect
+ postproc
+ av1_postproc
multithread
internal_stats
${CODECS}
@@ -309,6 +335,7 @@
shared
static
small
+ postproc_visualizer
os_support
unit_tests
webm_io
@@ -318,8 +345,10 @@
encode_perf_tests
multi_res_encoding
temporal_denoising
+ av1_temporal_denoising
coefficient_range_checking
aom_highbitdepth
+ better_hw_compatibility
experimental
size_limit
aom_qm
@@ -338,7 +367,6 @@
gprof
gcov
pic
- use_x86inc
optimizations
ccache
runtime_cpu_detect
@@ -355,6 +383,8 @@
dequant_tokens
dc_recon
+ postproc
+ av1_postproc
multithread
internal_stats
${CODECS}
@@ -367,6 +397,7 @@
shared
static
small
+ postproc_visualizer
unit_tests
webm_io
libyuv
@@ -375,7 +406,9 @@
encode_perf_tests
multi_res_encoding
temporal_denoising
+ av1_temporal_denoising
coefficient_range_checking
+ better_hw_compatibility
aom_highbitdepth
experimental
aom_qm
@@ -386,15 +419,19 @@
for opt do
optval="${opt#*=}"
case "$opt" in
- --disable-codecs) for c in ${CODECS}; do disable_feature $c; done ;;
+ --disable-codecs)
+ for c in ${CODEC_FAMILIES}; do disable_codec $c; done
+ ;;
--enable-?*|--disable-?*)
eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
- if echo "${EXPERIMENT_LIST}" | grep "^ *$option\$" >/dev/null; then
+ if is_in ${option} ${EXPERIMENT_LIST}; then
if enabled experimental; then
${action}_feature $option
else
log_echo "Ignoring $opt -- not in experimental mode."
fi
+ elif is_in ${option} "${CODECS} ${CODEC_FAMILIES}"; then
+ ${action}_codec ${option}
else
process_common_cmdline $opt
fi
@@ -408,14 +445,6 @@
post_process_cmdline() {
c=""
- # If the codec family is disabled, disable all components of that family.
- # If the codec family is enabled, enable all components of that family.
- log_echo "Configuring selected codecs"
- for c in ${CODECS}; do
- disabled ${c%%_*} && disable_feature ${c}
- enabled ${c%%_*} && enable_feature ${c}
- done
-
# Enable all detected codecs, if they haven't been disabled
for c in ${CODECS}; do soft_enable $c; done
@@ -437,6 +466,11 @@
enabled ec_multisymbol && ! enabled ans && soft_enable daala_ec
enabled ec_multisymbol && ! enabled daala_ec && soft_enable ans
enabled daala_ec && enable_feature ec_multisymbol
+ if enabled global_motion && (enabled ext_inter || enabled dual_filter); then
+ log_echo "global_motion currently not compatible with ext_inter"
+ log_echo "and dual_filter. Disabling global_motion."
+ disable_feature global_motion
+ fi
}
@@ -459,6 +493,7 @@
done
enabled debug_libs && DIST_DIR="${DIST_DIR}-debug"
enabled codec_srcs && DIST_DIR="${DIST_DIR}-src"
+ ! enabled postproc && ! enabled av1_postproc && DIST_DIR="${DIST_DIR}-nopost"
! enabled multithread && DIST_DIR="${DIST_DIR}-nomt"
! enabled install_docs && DIST_DIR="${DIST_DIR}-nodocs"
DIST_DIR="${DIST_DIR}-${tgt_isa}-${tgt_os}"
@@ -518,13 +553,18 @@
# Can only build shared libs on a subset of platforms. Doing this check
# here rather than at option parse time because the target auto-detect
# magic happens after the command line has been parsed.
- if ! enabled linux && ! enabled os2; then
+ case "${tgt_os}" in
+ linux|os2|darwin*|iphonesimulator*)
+ # Supported platforms
+ ;;
+ *)
if enabled gnu; then
echo "--enable-shared is only supported on ELF; assuming this is OK"
else
- die "--enable-shared only supported on ELF and OS/2 for now"
+ die "--enable-shared only supported on ELF, OS/2, and Darwin for now"
fi
- fi
+ ;;
+ esac
fi
if [ -z "$CC" ] || enabled external_build; then
echo "Bypassing toolchain for environment detection."
@@ -587,6 +627,7 @@
check_add_cflags -Wall
check_add_cflags -Wdeclaration-after-statement
check_add_cflags -Wdisabled-optimization
+ check_add_cflags -Wfloat-conversion
check_add_cflags -Wpointer-arith
check_add_cflags -Wtype-limits
check_add_cflags -Wvla
@@ -594,6 +635,7 @@
check_add_cflags -Wuninitialized
check_add_cflags -Wunused
check_add_cflags -Wsign-compare
+ check_add_cflags -Wlogical-op
# Enabling the following warning (in combination with -Wunused above)
# for C++ generates errors in third_party code including googletest and
# libyuv. So enable it only for C code.
@@ -602,14 +644,6 @@
# about some function parameters shadowing class member function names.
# So, only enable this warning for C code.
check_cflags "-Wshadow" && add_cflags_only "-Wshadow"
- case ${CC} in
- *clang*)
- # libaom and/or clang have issues with aliasing:
- # https://code.google.com/p/webm/issues/detail?id=603
- # work around them until they are fixed
- check_add_cflags -fno-strict-aliasing
- ;;
- esac
if enabled mips || [ -z "${INLINE}" ]; then
enabled extra_warnings || check_add_cflags -Wno-unused-function
fi
@@ -664,6 +698,13 @@
;;
esac
+ # Other toolchain specific defaults
+ case $toolchain in x86*) soft_enable postproc;; esac
+
+ if enabled postproc_visualizer; then
+ enabled postproc || die "postproc_visualizer requires postproc to be enabled"
+ fi
+
# Enable unit tests by default if we have a working C++ compiler.
case "$toolchain" in
*-vs*)
@@ -730,7 +771,7 @@
##
CONFIGURE_ARGS="$@"
process "$@"
-print_aomedia_license ${BUILD_PFX}aom_config.c "/*" " */"
+print_webm_license ${BUILD_PFX}aom_config.c "/*" " */"
cat <<EOF >> ${BUILD_PFX}aom_config.c
#include "aom/aom_codec.h"
static const char* const cfg = "$CONFIGURE_ARGS";
diff --git a/examples/aom_cx_set_ref.c b/examples/aom_cx_set_ref.c
index 74fdec0..6beb4fb 100644
--- a/examples/aom_cx_set_ref.c
+++ b/examples/aom_cx_set_ref.c
@@ -278,12 +278,12 @@
int main(int argc, char **argv) {
FILE *infile = NULL;
// Encoder
- aom_codec_ctx_t ecodec = { 0 };
- aom_codec_enc_cfg_t cfg = { 0 };
+ aom_codec_ctx_t ecodec;
+ aom_codec_enc_cfg_t cfg;
unsigned int frame_in = 0;
aom_image_t raw;
aom_codec_err_t res;
- AvxVideoInfo info = { 0 };
+ AvxVideoInfo info;
AvxVideoWriter *writer = NULL;
const AvxInterface *encoder = NULL;
@@ -309,6 +309,12 @@
unsigned int limit = 0;
exec_name = argv[0];
+ // Clear explicitly, as simply assigning "{ 0 }" generates
+ // "missing-field-initializers" warning in some compilers.
+ memset(&ecodec, 0, sizeof(ecodec));
+ memset(&cfg, 0, sizeof(cfg));
+ memset(&info, 0, sizeof(info));
+
if (argc < 7) die("Invalid number of arguments");
codec_arg = argv[1];
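
The memset calls added above replace aggregate initialization because "{ 0 }" only names the first member explicitly. A minimal sketch, using a hypothetical struct, of the pattern the comment describes:

    #include <string.h>

    struct example_cfg {
      int width;
      int height;
      int bitrate;
    };

    static void init_cfg(struct example_cfg *cfg) {
      /* "struct example_cfg c = { 0 };" may warn under
       * -Wmissing-field-initializers on some compilers, even though it
       * zero-initializes every field; an explicit memset avoids the warning. */
      memset(cfg, 0, sizeof(*cfg));
    }
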
diff --git a/examples/lossless_encoder.c b/examples/lossless_encoder.c
index 069e35e..1abeb27 100644
--- a/examples/lossless_encoder.c
+++ b/examples/lossless_encoder.c
@@ -63,13 +63,17 @@
int frame_count = 0;
aom_image_t raw;
aom_codec_err_t res;
- AvxVideoInfo info = { 0 };
+ AvxVideoInfo info;
AvxVideoWriter *writer = NULL;
const AvxInterface *encoder = NULL;
const int fps = 30;
exec_name = argv[0];
+ // Clear explicitly, as simply assigning "{ 0 }" generates
+ // "missing-field-initializers" warning in some compilers.
+ memset(&info, 0, sizeof(info));
+
if (argc < 5) die("Invalid number of arguments");
encoder = get_aom_encoder_by_name("av1");
diff --git a/examples/resize_util.c b/examples/resize_util.c
new file mode 100644
index 0000000..4f56885
--- /dev/null
+++ b/examples/resize_util.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../tools_common.h"
+#include "../av1/encoder/av1_resize.h"
+
+static const char *exec_name = NULL;
+
+static void usage() {
+ printf("Usage:\n");
+ printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> ",
+ exec_name);
+ printf("<output_yuv> [<frames>]\n");
+}
+
+void usage_exit(void) {
+ usage();
+ exit(EXIT_FAILURE);
+}
+
+static int parse_dim(char *v, int *width, int *height) {
+ char *x = strchr(v, 'x');
+ if (x == NULL) x = strchr(v, 'X');
+ if (x == NULL) return 0;
+ *width = atoi(v);
+ *height = atoi(&x[1]);
+ if (*width <= 0 || *height <= 0)
+ return 0;
+ else
+ return 1;
+}
+
+int main(int argc, char *argv[]) {
+ char *fin, *fout;
+ FILE *fpin, *fpout;
+ uint8_t *inbuf, *outbuf;
+ uint8_t *inbuf_u, *outbuf_u;
+ uint8_t *inbuf_v, *outbuf_v;
+ int f, frames;
+ int width, height, target_width, target_height;
+
+ exec_name = argv[0];
+
+ if (argc < 5) {
+ printf("Incorrect parameters:\n");
+ usage();
+ return 1;
+ }
+
+ fin = argv[1];
+ fout = argv[4];
+ if (!parse_dim(argv[2], &width, &height)) {
+ printf("Incorrect parameters: %s\n", argv[2]);
+ usage();
+ return 1;
+ }
+ if (!parse_dim(argv[3], &target_width, &target_height)) {
+ printf("Incorrect parameters: %s\n", argv[3]);
+ usage();
+ return 1;
+ }
+
+ fpin = fopen(fin, "rb");
+ if (fpin == NULL) {
+ printf("Can't open file %s to read\n", fin);
+ usage();
+ return 1;
+ }
+ fpout = fopen(fout, "wb");
+ if (fpout == NULL) {
+ printf("Can't open file %s to write\n", fout);
+ usage();
+ return 1;
+ }
+ if (argc >= 6)
+ frames = atoi(argv[5]);
+ else
+ frames = INT_MAX;
+
+ printf("Input size: %dx%d\n", width, height);
+ printf("Target size: %dx%d, Frames: ", target_width, target_height);
+ if (frames == INT_MAX)
+ printf("All\n");
+ else
+ printf("%d\n", frames);
+
+ inbuf = (uint8_t *)malloc(width * height * 3 / 2);
+ outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2);
+ inbuf_u = inbuf + width * height;
+ inbuf_v = inbuf_u + width * height / 4;
+ outbuf_u = outbuf + target_width * target_height;
+ outbuf_v = outbuf_u + target_width * target_height / 4;
+ f = 0;
+ while (f < frames) {
+ if (fread(inbuf, width * height * 3 / 2, 1, fpin) != 1) break;
+ av1_resize_frame420(inbuf, width, inbuf_u, inbuf_v, width / 2, height,
+ width, outbuf, target_width, outbuf_u, outbuf_v,
+ target_width / 2, target_height, target_width);
+ fwrite(outbuf, target_width * target_height * 3 / 2, 1, fpout);
+ f++;
+ }
+ printf("%d frames processed\n", f);
+ fclose(fpin);
+ fclose(fpout);
+
+ free(inbuf);
+ free(outbuf);
+ return 0;
+}
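
The buffer sizes in resize_util.c follow from the I420 layout: a full-resolution Y plane followed by U and V planes at half resolution in each dimension, for width * height * 3 / 2 bytes per frame. A small sketch of the same arithmetic, with a hypothetical helper name:

    #include <stddef.h>

    /* Bytes per I420 frame: Y is w*h, U and V are (w/2)*(h/2) each. */
    static size_t i420_frame_size(int w, int h) {
      const size_t luma = (size_t)w * h;
      const size_t chroma = (size_t)(w / 2) * (h / 2);
      return luma + 2 * chroma; /* equals w * h * 3 / 2 for even w, h */
    }
    /* e.g. i420_frame_size(704, 576) == 405504 + 2 * 101376 == 608256 */
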
diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c
index 418757d..1d2b51e 100644
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c
@@ -151,7 +151,7 @@
int frame_count = 0;
aom_image_t raw;
aom_codec_err_t res;
- AvxVideoInfo info = { 0 };
+ AvxVideoInfo info;
AvxVideoWriter *writer = NULL;
const AvxInterface *encoder = NULL;
const int fps = 30;
@@ -168,6 +168,10 @@
exec_name = argv[0];
+ // Clear explicitly, as simply assigning "{ 0 }" generates
+ // "missing-field-initializers" warning in some compilers.
+ memset(&info, 0, sizeof(info));
+
if (argc != 9) die("Invalid number of arguments");
codec_arg = argv[1];
diff --git a/ivfdec.c b/ivfdec.c
index 933b212..4182e11 100644
--- a/ivfdec.c
+++ b/ivfdec.c
@@ -24,7 +24,7 @@
// we can guess the framerate using only the timebase in this
// case. Other files would require reading ahead to guess the
// timebase, like we do for webm.
- if (*num < 1000) {
+ if (*den > 0 && *den < 1000000000 && *num > 0 && *num < 1000) {
// Correct for the factor of 2 applied to the timebase in the encoder.
if (*num & 1)
*den *= 2;
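
The tightened condition above adds sanity bounds before the IVF timebase is used to guess a frame rate: both terms must be positive, the numerator small (under 1000), and the denominator below one billion, presumably so that implausible header values are not misread as a timebase. A simplified sketch of that guard, with a hypothetical helper name:

    /* Mirror of the bounds in the new ivfdec.c condition. */
    static int plausible_ivf_timebase(unsigned int num, unsigned int den) {
      return den > 0 && den < 1000000000 && num > 0 && num < 1000;
    }
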
diff --git a/libs.mk b/libs.mk
index ba950c2..d4a3040 100644
--- a/libs.mk
+++ b/libs.mk
@@ -10,7 +10,6 @@
##
-
# ARM assembly files are written in RVCT-style. We use some make magic to
# filter those files to allow GCC compilation
ifeq ($(ARCH_ARM),yes)
@@ -68,7 +67,6 @@
CODEC_EXPORTS-yes += $(addprefix $(AV1_PREFIX),$(AV1_CX_EXPORTS))
CODEC_SRCS-yes += $(AV1_PREFIX)av1_cx.mk aom/aom.h aom/aomcx.h
INSTALL-LIBS-yes += include/aom/aom.h include/aom/aomcx.h
- INSTALL-LIBS-$(CONFIG_SPATIAL_SVC) += include/aom/svc_context.h
INSTALL_MAPS += include/aom/% $(SRC_PATH_BARE)/$(AV1_PREFIX)/%
CODEC_DOC_SRCS += aom/aom.h aom/aomcx.h
CODEC_DOC_SECTIONS += av1 av1_encoder
@@ -216,6 +214,12 @@
LIBAOM_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, \
libaom.dylib )
else
+ifeq ($(filter iphonesimulator%,$(TGT_OS)),$(TGT_OS))
+LIBAOM_SO := libaom.$(SO_VERSION_MAJOR).dylib
+SHARED_LIB_SUF := .dylib
+EXPORT_FILE := libaom.syms
+LIBAOM_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, libaom.dylib)
+else
ifeq ($(filter os2%,$(TGT_OS)),$(TGT_OS))
LIBAOM_SO := libaom$(SO_VERSION_MAJOR).dll
SHARED_LIB_SUF := _dll.a
@@ -231,6 +235,7 @@
libaom.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR))
endif
endif
+endif
LIBS-$(CONFIG_SHARED) += $(BUILD_PFX)$(LIBAOM_SO)\
$(notdir $(LIBAOM_SO_SYMLINKS)) \
@@ -257,7 +262,7 @@
$(qexec)echo LIBRARY $(LIBAOM_SO:.dll=) INITINSTANCE TERMINSTANCE > $@
$(qexec)echo "DATA MULTIPLE NONSHARED" >> $@
$(qexec)echo "EXPORTS" >> $@
- $(qexec)awk '!/aom_svc_*/ {print "_"$$2}' $^ >>$@
+ $(qexec)awk '{print "_"$$2}' $^ >>$@
CLEAN-OBJS += libaom.def
libaom_dll.a: $(LIBAOM_SO)
@@ -295,7 +300,7 @@
$(qexec)echo 'includedir=$${prefix}/include' >> $@
$(qexec)echo '' >> $@
$(qexec)echo 'Name: aom' >> $@
- $(qexec)echo 'Description: AOMedia Project AVx codec implementation' >> $@
+ $(qexec)echo 'Description: WebM Project AVx codec implementation' >> $@
$(qexec)echo 'Version: $(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)' >> $@
$(qexec)echo 'Requires:' >> $@
$(qexec)echo 'Conflicts:' >> $@
diff --git a/test/acm_random.h b/test/acm_random.h
index f7ce6e0..eb2353d 100644
--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -36,6 +36,12 @@
return (value >> 15) & 0xffff;
}
+ int16_t Rand9Signed(void) {
+ // Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
+ const uint32_t value = random_.Generate(512);
+ return static_cast<int16_t>(value) - 256;
+ }
+
uint8_t Rand8(void) {
const uint32_t value =
random_.Generate(testing::internal::Random::kMaxRange);
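
Rand9Signed() above maps Generate(512), which returns a value in [0, 511], down to [-256, 255] by subtracting 256, covering the full signed 9-bit range. A standalone sketch of the same mapping, using rand() as a stand-in for the gtest generator:

    #include <stdint.h>
    #include <stdlib.h>

    static int16_t rand9_signed_sketch(void) {
      const int value = rand() % 512;  /* stand-in for Generate(512): 0..511 */
      return (int16_t)(value - 256);   /* -256..255 */
    }
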
diff --git a/test/active_map_test.cc b/test/active_map_test.cc
index 42b45cc..2d30480 100644
--- a/test/active_map_test.cc
+++ b/test/active_map_test.cc
@@ -41,13 +41,19 @@
encoder->Control(AOME_SET_CPUUSED, cpu_used_);
} else if (video->frame() == 3) {
aom_active_map_t map = aom_active_map_t();
+ /* clang-format off */
uint8_t active_map[9 * 13] = {
- 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
- 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
- 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
- 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
- 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
+ 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1,
+ 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
+ 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
+ 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
};
+ /* clang-format on */
map.cols = (kWidth + 15) / 16;
map.rows = (kHeight + 15) / 16;
ASSERT_EQ(map.cols, 13u);
@@ -63,25 +69,35 @@
}
}
+ void DoTest() {
+ // Validate that this non-multiple-of-64-wide clip encodes
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_target_bitrate = 400;
+ cfg_.rc_resize_allowed = 0;
+ cfg_.g_pass = AOM_RC_ONE_PASS;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.kf_max_dist = 90000;
+ ::libaom_test::I420VideoSource video("hantro_odd.yuv", kWidth, kHeight, 30,
+ 1, 0, 20);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
int cpu_used_;
};
-TEST_P(ActiveMapTest, Test) {
- // Validate that this non multiple of 64 wide clip encodes
- cfg_.g_lag_in_frames = 0;
- cfg_.rc_target_bitrate = 400;
- cfg_.rc_resize_allowed = 0;
- cfg_.g_pass = AOM_RC_ONE_PASS;
- cfg_.rc_end_usage = AOM_CBR;
- cfg_.kf_max_dist = 90000;
+TEST_P(ActiveMapTest, Test) { DoTest(); }
- ::libaom_test::I420VideoSource video("hantro_odd.yuv", kWidth, kHeight, 30, 1,
- 0, 20);
+class ActiveMapTestLarge : public ActiveMapTest {};
- ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-}
+TEST_P(ActiveMapTestLarge, Test) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(ActiveMapTestLarge,
+ ::testing::Values(::libaom_test::kRealTime),
+ ::testing::Range(0, 5));
AV1_INSTANTIATE_TEST_CASE(ActiveMapTest,
::testing::Values(::libaom_test::kRealTime),
- ::testing::Range(0, 6));
+ ::testing::Range(5, 9));
+
} // namespace
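
The active map in the test above carries one byte per 16x16 macroblock, so its dimensions are the frame size rounded up to macroblock units; the (kWidth + 15) / 16 expression is a ceiling division. A small sketch of that arithmetic:

    /* Ceiling division of a pixel dimension into 16x16 macroblock units. */
    static int mb_units(int pixels) {
      return (pixels + 15) / 16;
    }
    /* For the 13-column, 9-row map asserted above, e.g. a 208x144 clip:
     * mb_units(208) == 13 and mb_units(144) == 9. */
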
diff --git a/test/add_noise_test.cc b/test/add_noise_test.cc
new file mode 100644
index 0000000..768ac36
--- /dev/null
+++ b/test/add_noise_test.cc
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <math.h>
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/postproc.h"
+#include "aom_mem/aom_mem.h"
+
+namespace {
+
+// TODO(jimbankoski): make width and height integers not unsigned.
+typedef void (*AddNoiseFunc)(unsigned char *start, char *noise,
+ char blackclamp[16], char whiteclamp[16],
+ char bothclamp[16], unsigned int width,
+ unsigned int height, int pitch);
+
+class AddNoiseTest : public ::testing::TestWithParam<AddNoiseFunc> {
+ public:
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+ virtual ~AddNoiseTest() {}
+};
+
+double stddev6(char a, char b, char c, char d, char e, char f) {
+ const double n = (a + b + c + d + e + f) / 6.0;
+ const double v = ((a - n) * (a - n) + (b - n) * (b - n) + (c - n) * (c - n) +
+ (d - n) * (d - n) + (e - n) * (e - n) + (f - n) * (f - n)) /
+ 6.0;
+ return sqrt(v);
+}
+
+TEST_P(AddNoiseTest, CheckNoiseAdded) {
+ DECLARE_ALIGNED(16, char, blackclamp[16]);
+ DECLARE_ALIGNED(16, char, whiteclamp[16]);
+ DECLARE_ALIGNED(16, char, bothclamp[16]);
+ const int width = 64;
+ const int height = 64;
+ const int image_size = width * height;
+ char noise[3072];
+ const int clamp = aom_setup_noise(4.4, sizeof(noise), noise);
+
+ for (int i = 0; i < 16; i++) {
+ blackclamp[i] = clamp;
+ whiteclamp[i] = clamp;
+ bothclamp[i] = 2 * clamp;
+ }
+
+ uint8_t *const s = reinterpret_cast<uint8_t *>(aom_calloc(image_size, 1));
+ memset(s, 99, image_size);
+
+ ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
+ bothclamp, width, height, width));
+
+ // Check that we don't end up with either identical or no added noise,
+ // either vertically or horizontally.
+ for (int i = 0; i < image_size - 6 * width - 6; ++i) {
+ const double hd = stddev6(s[i] - 99, s[i + 1] - 99, s[i + 2] - 99,
+ s[i + 3] - 99, s[i + 4] - 99, s[i + 5] - 99);
+ const double vd = stddev6(s[i] - 99, s[i + width] - 99,
+ s[i + 2 * width] - 99, s[i + 3 * width] - 99,
+ s[i + 4 * width] - 99, s[i + 5 * width] - 99);
+
+ EXPECT_NE(hd, 0);
+ EXPECT_NE(vd, 0);
+ }
+
+ // Initialize pixels in the image to 255 and check for roll over.
+ memset(s, 255, image_size);
+
+ ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
+ bothclamp, width, height, width));
+
+ // Check to make sure we don't roll over.
+ for (int i = 0; i < image_size; ++i) {
+ EXPECT_GT(static_cast<int>(s[i]), clamp) << "i = " << i;
+ }
+
+ // Initialize pixels in the image to 0 and check for roll under.
+ memset(s, 0, image_size);
+
+ ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
+ bothclamp, width, height, width));
+
+ // Check to make sure we don't roll under.
+ for (int i = 0; i < image_size; ++i) {
+ EXPECT_LT(static_cast<int>(s[i]), 255 - clamp) << "i = " << i;
+ }
+
+ aom_free(s);
+}
+
+TEST_P(AddNoiseTest, CheckCvsAssembly) {
+ DECLARE_ALIGNED(16, char, blackclamp[16]);
+ DECLARE_ALIGNED(16, char, whiteclamp[16]);
+ DECLARE_ALIGNED(16, char, bothclamp[16]);
+ const int width = 64;
+ const int height = 64;
+ const int image_size = width * height;
+ char noise[3072];
+
+ const int clamp = aom_setup_noise(4.4, sizeof(noise), noise);
+
+ for (int i = 0; i < 16; i++) {
+ blackclamp[i] = clamp;
+ whiteclamp[i] = clamp;
+ bothclamp[i] = 2 * clamp;
+ }
+
+ uint8_t *const s = reinterpret_cast<uint8_t *>(aom_calloc(image_size, 1));
+ uint8_t *const d = reinterpret_cast<uint8_t *>(aom_calloc(image_size, 1));
+
+ memset(s, 99, image_size);
+ memset(d, 99, image_size);
+
+ srand(0);
+ ASM_REGISTER_STATE_CHECK(GetParam()(s, noise, blackclamp, whiteclamp,
+ bothclamp, width, height, width));
+ srand(0);
+ ASM_REGISTER_STATE_CHECK(aom_plane_add_noise_c(
+ d, noise, blackclamp, whiteclamp, bothclamp, width, height, width));
+
+ for (int i = 0; i < image_size; ++i) {
+ EXPECT_EQ(static_cast<int>(s[i]), static_cast<int>(d[i])) << "i = " << i;
+ }
+
+ aom_free(d);
+ aom_free(s);
+}
+
+INSTANTIATE_TEST_CASE_P(C, AddNoiseTest,
+ ::testing::Values(aom_plane_add_noise_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, AddNoiseTest,
+ ::testing::Values(aom_plane_add_noise_sse2));
+#endif
+
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, AddNoiseTest,
+ ::testing::Values(aom_plane_add_noise_msa));
+#endif
+} // namespace
diff --git a/test/altref_test.cc b/test/altref_test.cc
new file mode 100644
index 0000000..29d6a63
--- /dev/null
+++ b/test/altref_test.cc
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+namespace {
+
+class AltRefForcedKeyTestLarge
+ : public ::libaom_test::EncoderTest,
+ public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int> {
+ protected:
+ AltRefForcedKeyTestLarge()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ cpu_used_(GET_PARAM(2)), forced_kf_frame_num_(1), frame_num_(0) {}
+ virtual ~AltRefForcedKeyTestLarge() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.g_threads = 0;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+#if CONFIG_AV1_ENCODER
+ // override test default for tile columns if necessary.
+ if (GET_PARAM(0) == &libaom_test::kAV1) {
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 6);
+ }
+#endif
+ }
+ frame_flags_ =
+ (video->frame() == forced_kf_frame_num_) ? AOM_EFLAG_FORCE_KF : 0;
+ }
+
+ virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ if (frame_num_ == forced_kf_frame_num_) {
+ ASSERT_TRUE(!!(pkt->data.frame.flags & AOM_FRAME_IS_KEY))
+ << "Frame #" << frame_num_ << " isn't a keyframe!";
+ }
+ ++frame_num_;
+ }
+
+ ::libaom_test::TestMode encoding_mode_;
+ int cpu_used_;
+ unsigned int forced_kf_frame_num_;
+ unsigned int frame_num_;
+};
+
+TEST_P(AltRefForcedKeyTestLarge, Frame1IsKey) {
+ const aom_rational timebase = { 1, 30 };
+ const int lag_values[] = { 3, 15, 25, -1 };
+
+ forced_kf_frame_num_ = 1;
+ for (int i = 0; lag_values[i] != -1; ++i) {
+ frame_num_ = 0;
+ cfg_.g_lag_in_frames = lag_values[i];
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ timebase.den, timebase.num, 0, 30);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+}
+
+TEST_P(AltRefForcedKeyTestLarge, ForcedFrameIsKey) {
+ const aom_rational timebase = { 1, 30 };
+ const int lag_values[] = { 3, 15, 25, -1 };
+
+ for (int i = 0; lag_values[i] != -1; ++i) {
+ frame_num_ = 0;
+ forced_kf_frame_num_ = lag_values[i] - 1;
+ cfg_.g_lag_in_frames = lag_values[i];
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ timebase.den, timebase.num, 0, 30);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+}
+
+AV1_INSTANTIATE_TEST_CASE(AltRefForcedKeyTestLarge,
+ ::testing::Values(::libaom_test::kOnePassGood),
+ ::testing::Range(0, 9));
+
+} // namespace
diff --git a/test/android/Android.mk b/test/android/Android.mk
index e5ae293..9ad2a34 100644
--- a/test/android/Android.mk
+++ b/test/android/Android.mk
@@ -9,7 +9,6 @@
# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
#
# This make file builds aom_test app for android.
-
# The test app itself runs on the command line through adb shell
# The paths are really messed up as the libaom make file
# expects to be made from a parent directory.
@@ -19,13 +18,13 @@
#libwebm
include $(CLEAR_VARS)
-include $(BINDINGS_DIR)/aom/third_party/libwebm/Android.mk
+include $(BINDINGS_DIR)/libaom/third_party/libwebm/Android.mk
LOCAL_PATH := $(CUR_WD)/../../..
#libaom
include $(CLEAR_VARS)
LOCAL_STATIC_LIBRARIES := libwebm
-include $(BINDINGS_DIR)/aom/build/make/Android.mk
+include $(BINDINGS_DIR)/libaom/build/make/Android.mk
LOCAL_PATH := $(CUR_WD)/../..
#libgtest
diff --git a/test/aq_segment_test.cc b/test/aq_segment_test.cc
index 1f3d4b4..e231b4e 100644
--- a/test/aq_segment_test.cc
+++ b/test/aq_segment_test.cc
@@ -41,69 +41,45 @@
}
}
+ void DoTest(int aq_mode) {
+ aq_mode_ = aq_mode;
+ cfg_.kf_max_dist = 12;
+ cfg_.rc_min_quantizer = 8;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.g_lag_in_frames = 6;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_target_bitrate = 300;
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352,
+ 288, 30, 1, 0, 15);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
int set_cpu_used_;
int aq_mode_;
};
// Validate that this AQ segmentation mode (AQ=1, variance_aq)
// encodes and decodes without a mismatch.
-TEST_P(AqSegmentTest, TestNoMisMatchAQ1) {
- cfg_.rc_min_quantizer = 8;
- cfg_.rc_max_quantizer = 56;
- cfg_.rc_end_usage = AOM_CBR;
- cfg_.g_lag_in_frames = 0;
- cfg_.rc_buf_initial_sz = 500;
- cfg_.rc_buf_optimal_sz = 500;
- cfg_.rc_buf_sz = 1000;
- cfg_.rc_target_bitrate = 300;
-
- aq_mode_ = 1;
-
- ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
- 30, 1, 0, 100);
-
- ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-}
+TEST_P(AqSegmentTest, TestNoMisMatchAQ1) { DoTest(1); }
// Validate that this AQ segmentation mode (AQ=2, complexity_aq)
// encodes and decodes without a mismatch.
-TEST_P(AqSegmentTest, TestNoMisMatchAQ2) {
- cfg_.rc_min_quantizer = 8;
- cfg_.rc_max_quantizer = 56;
- cfg_.rc_end_usage = AOM_CBR;
- cfg_.g_lag_in_frames = 0;
- cfg_.rc_buf_initial_sz = 500;
- cfg_.rc_buf_optimal_sz = 500;
- cfg_.rc_buf_sz = 1000;
- cfg_.rc_target_bitrate = 300;
-
- aq_mode_ = 2;
-
- ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
- 30, 1, 0, 100);
-
- ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-}
+TEST_P(AqSegmentTest, TestNoMisMatchAQ2) { DoTest(2); }
// Validate that this AQ segmentation mode (AQ=3, cyclic_refresh_aq)
// encodes and decodes without a mismatch.
-TEST_P(AqSegmentTest, TestNoMisMatchAQ3) {
- cfg_.rc_min_quantizer = 8;
- cfg_.rc_max_quantizer = 56;
- cfg_.rc_end_usage = AOM_CBR;
- cfg_.g_lag_in_frames = 0;
- cfg_.rc_buf_initial_sz = 500;
- cfg_.rc_buf_optimal_sz = 500;
- cfg_.rc_buf_sz = 1000;
- cfg_.rc_target_bitrate = 300;
+TEST_P(AqSegmentTest, TestNoMisMatchAQ3) { DoTest(3); }
- aq_mode_ = 3;
+class AqSegmentTestLarge : public AqSegmentTest {};
- ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
- 30, 1, 0, 100);
+TEST_P(AqSegmentTestLarge, TestNoMisMatchAQ1) { DoTest(1); }
- ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-}
+TEST_P(AqSegmentTestLarge, TestNoMisMatchAQ2) { DoTest(2); }
+
+TEST_P(AqSegmentTestLarge, TestNoMisMatchAQ3) { DoTest(3); }
#if CONFIG_DELTA_Q
// Validate that this AQ mode (AQ=4, delta q)
@@ -123,5 +99,9 @@
AV1_INSTANTIATE_TEST_CASE(AqSegmentTest,
::testing::Values(::libaom_test::kRealTime,
::libaom_test::kOnePassGood),
- ::testing::Range(3, 9));
+ ::testing::Range(5, 9));
+AV1_INSTANTIATE_TEST_CASE(AqSegmentTestLarge,
+ ::testing::Values(::libaom_test::kRealTime,
+ ::libaom_test::kOnePassGood),
+ ::testing::Range(3, 5));
} // namespace
diff --git a/test/arf_freq_test.cc b/test/arf_freq_test.cc
index afddfea..26d7f32 100644
--- a/test/arf_freq_test.cc
+++ b/test/arf_freq_test.cc
@@ -75,16 +75,16 @@
return !strcmp(dot, ".y4m");
}
-class ArfFreqTest
+class ArfFreqTestLarge
: public ::libaom_test::EncoderTest,
public ::libaom_test::CodecTestWith3Params<TestVideoParam,
TestEncodeParam, int> {
protected:
- ArfFreqTest()
+ ArfFreqTestLarge()
: EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
test_encode_param_(GET_PARAM(2)), min_arf_requested_(GET_PARAM(3)) {}
- virtual ~ArfFreqTest() {}
+ virtual ~ArfFreqTestLarge() {}
virtual void SetUp() {
InitializeConfig();
@@ -180,7 +180,7 @@
int run_of_visible_frames_;
};
-TEST_P(ArfFreqTest, MinArfFreqTest) {
+TEST_P(ArfFreqTestLarge, MinArfFreqTest) {
cfg_.rc_target_bitrate = kBitrate;
cfg_.g_error_resilient = 0;
cfg_.g_profile = test_video_param_.profile;
@@ -189,26 +189,25 @@
init_flags_ = AOM_CODEC_USE_PSNR;
if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
- libaom_test::VideoSource *video;
+ testing::internal::scoped_ptr<libaom_test::VideoSource> video;
if (is_extension_y4m(test_video_param_.filename)) {
- video =
- new libaom_test::Y4mVideoSource(test_video_param_.filename, 0, kFrames);
+ video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+ kFrames));
} else {
- video = new libaom_test::YUVVideoSource(
+ video.reset(new libaom_test::YUVVideoSource(
test_video_param_.filename, test_video_param_.fmt,
test_video_param_.width, test_video_param_.height,
test_video_param_.framerate_num, test_video_param_.framerate_den, 0,
- kFrames);
+ kFrames));
}
- ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
const int min_run = GetMinVisibleRun();
const int min_arf_dist_requested = GetMinArfDistanceRequested();
if (min_run != ARF_NOT_SEEN && min_run != ARF_SEEN_ONCE) {
const int min_arf_dist = min_run + 1;
EXPECT_GE(min_arf_dist, min_arf_dist_requested);
}
- delete (video);
}
#if CONFIG_AOM_HIGHBITDEPTH || CONFIG_EXT_REFS
@@ -219,7 +218,7 @@
// consecutive BWDREF_FRAME's may vary between 1 and any arbitrary positive
// number as long as it does not exceed the gf_group interval.
INSTANTIATE_TEST_CASE_P(
- DISABLED_AV1, ArfFreqTest,
+ DISABLED_AV1, ArfFreqTestLarge,
::testing::Combine(
::testing::Values(
static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)),
@@ -227,7 +226,7 @@
::testing::ValuesIn(kMinArfVectors)));
#endif // CONFIG_AV1_ENCODER
#else
-AV1_INSTANTIATE_TEST_CASE(ArfFreqTest, ::testing::ValuesIn(kTestVectors),
+AV1_INSTANTIATE_TEST_CASE(ArfFreqTestLarge, ::testing::ValuesIn(kTestVectors),
::testing::ValuesIn(kEncodeVectors),
::testing::ValuesIn(kMinArfVectors));
#endif // CONFIG_AOM_HIGHBITDEPTH || CONFIG_EXT_REFS
diff --git a/test/av1_convolve_optimz_test.cc b/test/av1_convolve_optimz_test.cc
index f6dc324..b891e99 100644
--- a/test/av1_convolve_optimz_test.cc
+++ b/test/av1_convolve_optimz_test.cc
@@ -12,7 +12,6 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./av1_rtcd.h"
-#include "av1/common/filter.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
@@ -55,7 +54,6 @@
const size_t maxBlockSize = maxWidth * maxHeight;
const int horizOffset = 32;
const int vertiOffset = 32;
-const size_t testMaxBlk = 128;
const int stride = 128;
const int x_step_q4 = 16;
@@ -91,7 +89,7 @@
void RunVertFilterBitExactCheck();
private:
- void PrepFilterBuffer(int w, int h);
+ void PrepFilterBuffer();
void DiffFilterBuffer();
conv_filter_t conv_horiz_;
conv_filter_t conv_vert_;
@@ -107,7 +105,7 @@
int avg_;
};
-void AV1ConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
+void AV1ConvolveOptimzTest::PrepFilterBuffer() {
int r, c;
ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -151,9 +149,9 @@
}
void AV1ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
- PrepFilterBuffer(testMaxBlk, testMaxBlk);
+ PrepFilterBuffer();
- InterpFilterParams filter_params = get_interp_filter_params(filter_);
+ InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
av1_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_, height_,
filter_params, subpel_, x_step_q4, avg_);
@@ -168,7 +166,7 @@
// and test again.
int intermediate_height =
(((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
- PrepFilterBuffer(testMaxBlk, testMaxBlk);
+ PrepFilterBuffer();
av1_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_,
intermediate_height, filter_params, subpel_, x_step_q4,
@@ -181,9 +179,9 @@
}
void AV1ConvolveOptimzTest::RunVertFilterBitExactCheck() {
- PrepFilterBuffer(testMaxBlk, testMaxBlk);
+ PrepFilterBuffer();
- InterpFilterParams filter_params = get_interp_filter_params(filter_);
+ InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
av1_convolve_vert_c(src_ref_, stride, dst_ref_, stride, width_, height_,
filter_params, subpel_, x_step_q4, avg_);
@@ -219,7 +217,7 @@
const int kSubpelQ4[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
const int kAvg[] = { 0, 1 };
-#endif // (HAVE_SSSE3 || HAVE_SSE4_1) && CONFIG_EXT_INTERP
+#endif
#if HAVE_SSSE3 && CONFIG_EXT_INTERP
INSTANTIATE_TEST_CASE_P(
@@ -267,7 +265,7 @@
void RunVertFilterBitExactCheck();
private:
- void PrepFilterBuffer(int w, int h);
+ void PrepFilterBuffer();
void DiffFilterBuffer();
hbd_conv_filter_t conv_horiz_;
hbd_conv_filter_t conv_vert_;
@@ -284,7 +282,7 @@
int bit_depth_;
};
-void AV1HbdConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
+void AV1HbdConvolveOptimzTest::PrepFilterBuffer() {
int r, c;
ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -327,9 +325,9 @@
}
void AV1HbdConvolveOptimzTest::RunHorizFilterBitExactCheck() {
- PrepFilterBuffer(testMaxBlk, testMaxBlk);
+ PrepFilterBuffer();
- InterpFilterParams filter_params = get_interp_filter_params(filter_);
+ InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
av1_highbd_convolve_horiz_c(src_, stride, dst_ref_, stride, width_, height_,
filter_params, subpel_, x_step_q4, avg_,
@@ -345,7 +343,7 @@
// and test again.
int intermediate_height =
(((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
- PrepFilterBuffer(testMaxBlk, testMaxBlk);
+ PrepFilterBuffer();
av1_highbd_convolve_horiz_c(src_, stride, dst_ref_, stride, width_,
intermediate_height, filter_params, subpel_,
@@ -358,9 +356,9 @@
}
void AV1HbdConvolveOptimzTest::RunVertFilterBitExactCheck() {
- PrepFilterBuffer(testMaxBlk, testMaxBlk);
+ PrepFilterBuffer();
- InterpFilterParams filter_params = get_interp_filter_params(filter_);
+ InterpFilterParams filter_params = av1_get_interp_filter_params(filter_);
av1_highbd_convolve_vert_c(src_, stride, dst_ref_, stride, width_, height_,
filter_params, subpel_, x_step_q4, avg_,
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index fbd939f..0324b8e 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -11,6 +11,7 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./av1_rtcd.h"
#include "./aom_dsp_rtcd.h"
#include "test/acm_random.h"
#include "av1/common/filter.h"
@@ -21,11 +22,26 @@
using libaom_test::ACMRandom;
namespace {
+void setup_convolve() {
+#if HAVE_SSSE3 && CONFIG_RUNTIME_CPU_DETECT
+ av1_convolve_horiz = av1_convolve_horiz_c;
+ av1_convolve_vert = av1_convolve_vert_c;
+#endif
+}
+
TEST(AV1ConvolveTest, av1_convolve8) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
- InterpFilter interp_filter = EIGHTTAP;
- InterpFilterParams filter_params = get_interp_filter_params(interp_filter);
- ptrdiff_t filter_size = filter_params.taps;
+#if CONFIG_DUAL_FILTER
+ InterpFilter interp_filter[4] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR,
+ EIGHTTAP_REGULAR, EIGHTTAP_REGULAR };
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter[0]);
+#else
+ InterpFilter interp_filter = EIGHTTAP_REGULAR;
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter);
+#endif
+ int filter_size = filter_params.taps;
int filter_center = filter_size / 2 - 1;
uint8_t src[12 * 12];
int src_stride = filter_size;
@@ -41,18 +57,20 @@
int w = 1;
int h = 1;
+ setup_convolve();
+
for (int i = 0; i < filter_size * filter_size; i++) {
src[i] = rnd.Rand16() % (1 << 8);
}
av1_convolve(src + src_stride * filter_center + filter_center, src_stride,
- dst, dst_stride, w, h, &interp_filter, subpel_x_q4, x_step_q4,
+ dst, dst_stride, w, h, interp_filter, subpel_x_q4, x_step_q4,
subpel_y_q4, y_step_q4, avg);
const int16_t *x_filter =
- get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
const int16_t *y_filter =
- get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
aom_convolve8_c(src + src_stride * filter_center + filter_center, src_stride,
dst1, dst_stride, x_filter, 16, y_filter, 16, w, h);
@@ -60,9 +78,17 @@
}
TEST(AV1ConvolveTest, av1_convolve) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
- InterpFilter interp_filter = EIGHTTAP;
- InterpFilterParams filter_params = get_interp_filter_params(interp_filter);
- ptrdiff_t filter_size = filter_params.taps;
+#if CONFIG_DUAL_FILTER
+ InterpFilter interp_filter[4] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR,
+ EIGHTTAP_REGULAR, EIGHTTAP_REGULAR };
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter[0]);
+#else
+ InterpFilter interp_filter = EIGHTTAP_REGULAR;
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter);
+#endif
+ int filter_size = filter_params.taps;
int filter_center = filter_size / 2 - 1;
uint8_t src[12 * 12];
int src_stride = filter_size;
@@ -77,6 +103,8 @@
int subpel_x_q4;
int subpel_y_q4;
+ setup_convolve();
+
for (int i = 0; i < filter_size * filter_size; i++) {
src[i] = rnd.Rand16() % (1 << 8);
}
@@ -84,13 +112,13 @@
for (subpel_x_q4 = 0; subpel_x_q4 < 16; subpel_x_q4++) {
for (subpel_y_q4 = 0; subpel_y_q4 < 16; subpel_y_q4++) {
av1_convolve(src + src_stride * filter_center + filter_center, src_stride,
- dst, dst_stride, w, h, &interp_filter, subpel_x_q4,
- x_step_q4, subpel_y_q4, y_step_q4, avg);
+ dst, dst_stride, w, h, interp_filter, subpel_x_q4, x_step_q4,
+ subpel_y_q4, y_step_q4, avg);
const int16_t *x_filter =
- get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
const int16_t *y_filter =
- get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
int temp[12];
int dst_ref = 0;
@@ -110,9 +138,17 @@
TEST(AV1ConvolveTest, av1_convolve_avg) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
- InterpFilter interp_filter = EIGHTTAP;
- InterpFilterParams filter_params = get_interp_filter_params(interp_filter);
- ptrdiff_t filter_size = filter_params.taps;
+#if CONFIG_DUAL_FILTER
+ InterpFilter interp_filter[4] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR,
+ EIGHTTAP_REGULAR, EIGHTTAP_REGULAR };
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter[0]);
+#else
+ InterpFilter interp_filter = EIGHTTAP_REGULAR;
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter);
+#endif
+ int filter_size = filter_params.taps;
int filter_center = filter_size / 2 - 1;
uint8_t src0[12 * 12];
uint8_t src1[12 * 12];
@@ -131,6 +167,8 @@
int subpel_x_q4;
int subpel_y_q4;
+ setup_convolve();
+
for (int i = 0; i < filter_size * filter_size; i++) {
src0[i] = rnd.Rand16() % (1 << 8);
src1[i] = rnd.Rand16() % (1 << 8);
@@ -142,19 +180,20 @@
for (subpel_y_q4 = 0; subpel_y_q4 < 16; subpel_y_q4++) {
avg = 0;
av1_convolve(src0 + offset, src_stride, dst0, dst_stride, w, h,
- &interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+ interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
y_step_q4, avg);
avg = 0;
av1_convolve(src1 + offset, src_stride, dst1, dst_stride, w, h,
- &interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+ interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
y_step_q4, avg);
+
avg = 0;
av1_convolve(src0 + offset, src_stride, dst, dst_stride, w, h,
- &interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+ interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
y_step_q4, avg);
avg = 1;
av1_convolve(src1 + offset, src_stride, dst, dst_stride, w, h,
- &interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+ interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
y_step_q4, avg);
EXPECT_EQ(dst[0], ROUND_POWER_OF_TWO(dst0[0] + dst1[0], 1));
@@ -165,9 +204,17 @@
#if CONFIG_AOM_HIGHBITDEPTH
TEST(AV1ConvolveTest, av1_highbd_convolve) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
- InterpFilter interp_filter = EIGHTTAP;
- InterpFilterParams filter_params = get_interp_filter_params(interp_filter);
- ptrdiff_t filter_size = filter_params.taps;
+#if CONFIG_DUAL_FILTER
+ InterpFilter interp_filter[4] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR,
+ EIGHTTAP_REGULAR, EIGHTTAP_REGULAR };
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter[0]);
+#else
+ InterpFilter interp_filter = EIGHTTAP_REGULAR;
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter);
+#endif
+ int filter_size = filter_params.taps;
int filter_center = filter_size / 2 - 1;
uint16_t src[12 * 12];
int src_stride = filter_size;
@@ -190,14 +237,14 @@
for (subpel_x_q4 = 0; subpel_x_q4 < 16; subpel_x_q4++) {
for (subpel_y_q4 = 0; subpel_y_q4 < 16; subpel_y_q4++) {
av1_highbd_convolve(
- CONVERT_TO_BYTEPTR(src) + src_stride * filter_center + filter_center,
- src_stride, CONVERT_TO_BYTEPTR(dst), dst_stride, w, h, &interp_filter,
+ CONVERT_TO_BYTEPTR(src + src_stride * filter_center + filter_center),
+ src_stride, CONVERT_TO_BYTEPTR(dst), dst_stride, w, h, interp_filter,
subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, avg, bd);
const int16_t *x_filter =
- get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
const int16_t *y_filter =
- get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
+ av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
int temp[12];
int dst_ref = 0;
@@ -218,9 +265,17 @@
TEST(AV1ConvolveTest, av1_highbd_convolve_avg) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
- InterpFilter interp_filter = EIGHTTAP;
- InterpFilterParams filter_params = get_interp_filter_params(interp_filter);
- ptrdiff_t filter_size = filter_params.taps;
+#if CONFIG_DUAL_FILTER
+ InterpFilter interp_filter[4] = { EIGHTTAP_REGULAR, EIGHTTAP_REGULAR,
+ EIGHTTAP_REGULAR, EIGHTTAP_REGULAR };
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter[0]);
+#else
+ InterpFilter interp_filter = EIGHTTAP_REGULAR;
+ InterpFilterParams filter_params =
+ av1_get_interp_filter_params(interp_filter);
+#endif
+ int filter_size = filter_params.taps;
int filter_center = filter_size / 2 - 1;
uint16_t src0[12 * 12];
uint16_t src1[12 * 12];
@@ -250,25 +305,25 @@
int offset = filter_size * filter_center + filter_center;
avg = 0;
- av1_highbd_convolve(CONVERT_TO_BYTEPTR(src0) + offset, src_stride,
+ av1_highbd_convolve(CONVERT_TO_BYTEPTR(src0 + offset), src_stride,
CONVERT_TO_BYTEPTR(dst0), dst_stride, w, h,
- &interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+ interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
y_step_q4, avg, bd);
avg = 0;
- av1_highbd_convolve(CONVERT_TO_BYTEPTR(src1) + offset, src_stride,
+ av1_highbd_convolve(CONVERT_TO_BYTEPTR(src1 + offset), src_stride,
CONVERT_TO_BYTEPTR(dst1), dst_stride, w, h,
- &interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+ interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
y_step_q4, avg, bd);
avg = 0;
- av1_highbd_convolve(CONVERT_TO_BYTEPTR(src0) + offset, src_stride,
+ av1_highbd_convolve(CONVERT_TO_BYTEPTR(src0 + offset), src_stride,
CONVERT_TO_BYTEPTR(dst), dst_stride, w, h,
- &interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+ interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
y_step_q4, avg, bd);
avg = 1;
- av1_highbd_convolve(CONVERT_TO_BYTEPTR(src1) + offset, src_stride,
+ av1_highbd_convolve(CONVERT_TO_BYTEPTR(src1 + offset), src_stride,
CONVERT_TO_BYTEPTR(dst), dst_stride, w, h,
- &interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+ interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
y_step_q4, avg, bd);
EXPECT_EQ(dst[0], ROUND_POWER_OF_TWO(dst0[0] + dst1[0], 1));
@@ -276,4 +331,132 @@
}
}
#endif // CONFIG_AOM_HIGHBITDEPTH
+
+#define CONVOLVE_SPEED_TEST 0
+#if CONVOLVE_SPEED_TEST
+#define highbd_convolve_speed(func, block_size, frame_size) \
+ TEST(AV1ConvolveTest, func##_speed_##block_size##_##frame_size) { \
+ ACMRandom rnd(ACMRandom::DeterministicSeed()); \
+ InterpFilter interp_filter = EIGHTTAP; \
+ InterpFilterParams filter_params = \
+ av1_get_interp_filter_params(interp_filter); \
+ int filter_size = filter_params.tap; \
+ int filter_center = filter_size / 2 - 1; \
+ DECLARE_ALIGNED(16, uint16_t, \
+ src[(frame_size + 7) * (frame_size + 7)]) = { 0 }; \
+ int src_stride = frame_size + 7; \
+ DECLARE_ALIGNED(16, uint16_t, dst[frame_size * frame_size]) = { 0 }; \
+ int dst_stride = frame_size; \
+ int x_step_q4 = 16; \
+ int y_step_q4 = 16; \
+ int subpel_x_q4 = 8; \
+ int subpel_y_q4 = 6; \
+ int bd = 10; \
+ \
+ int w = block_size; \
+ int h = block_size; \
+ \
+ const int16_t *filter_x = \
+ av1_get_interp_filter_kernel(filter_params, subpel_x_q4); \
+ const int16_t *filter_y = \
+ av1_get_interp_filter_kernel(filter_params, subpel_y_q4); \
+ \
+ for (int i = 0; i < src_stride * src_stride; i++) { \
+ src[i] = rnd.Rand16() % (1 << bd); \
+ } \
+ \
+ int offset = filter_center * src_stride + filter_center; \
+ int row_offset = 0; \
+ int col_offset = 0; \
+ for (int i = 0; i < 100000; i++) { \
+ int src_total_offset = offset + col_offset * src_stride + row_offset; \
+ int dst_total_offset = col_offset * dst_stride + row_offset; \
+ func(CONVERT_TO_BYTEPTR(src + src_total_offset), src_stride, \
+ CONVERT_TO_BYTEPTR(dst + dst_total_offset), dst_stride, filter_x, \
+ x_step_q4, filter_y, y_step_q4, w, h, bd); \
+ if (offset + w + w < frame_size) { \
+ row_offset += w; \
+ } else { \
+ row_offset = 0; \
+ col_offset += h; \
+ } \
+ if (col_offset + h >= frame_size) { \
+ col_offset = 0; \
+ } \
+ } \
+ }
+
+#define lowbd_convolve_speed(func, block_size, frame_size) \
+ TEST(AV1ConvolveTest, func##_speed_l_##block_size##_##frame_size) { \
+ ACMRandom rnd(ACMRandom::DeterministicSeed()); \
+ InterpFilter interp_filter = EIGHTTAP; \
+ InterpFilterParams filter_params = \
+ av1_get_interp_filter_params(interp_filter); \
+ int filter_size = filter_params.tap; \
+ int filter_center = filter_size / 2 - 1; \
+ DECLARE_ALIGNED(16, uint8_t, src[(frame_size + 7) * (frame_size + 7)]); \
+ int src_stride = frame_size + 7; \
+ DECLARE_ALIGNED(16, uint8_t, dst[frame_size * frame_size]); \
+ int dst_stride = frame_size; \
+ int x_step_q4 = 16; \
+ int y_step_q4 = 16; \
+ int subpel_x_q4 = 8; \
+ int subpel_y_q4 = 6; \
+ int bd = 8; \
+ \
+ int w = block_size; \
+ int h = block_size; \
+ \
+ const int16_t *filter_x = \
+ av1_get_interp_filter_kernel(filter_params, subpel_x_q4); \
+ const int16_t *filter_y = \
+ av1_get_interp_filter_kernel(filter_params, subpel_y_q4); \
+ \
+ for (int i = 0; i < src_stride * src_stride; i++) { \
+ src[i] = rnd.Rand16() % (1 << bd); \
+ } \
+ \
+ int offset = filter_center * src_stride + filter_center; \
+ int row_offset = 0; \
+ int col_offset = 0; \
+ for (int i = 0; i < 100000; i++) { \
+ func(src + offset, src_stride, dst, dst_stride, filter_x, x_step_q4, \
+ filter_y, y_step_q4, w, h); \
+ if (offset + w + w < frame_size) { \
+ row_offset += w; \
+ } else { \
+ row_offset = 0; \
+ col_offset += h; \
+ } \
+ if (col_offset + h >= frame_size) { \
+ col_offset = 0; \
+ } \
+ } \
+ }
+
+// This experiment shows that when the frame size is 64x64,
+// aom_highbd_convolve8_sse2 and aom_convolve8_sse2 run at similar speeds.
+// However, when the frame size grows to 1024x1024,
+// aom_highbd_convolve8_sse2 is around 50% slower than aom_convolve8_sse2;
+// we think the bottleneck is memory I/O.
+highbd_convolve_speed(aom_highbd_convolve8_sse2, 8, 64);
+highbd_convolve_speed(aom_highbd_convolve8_sse2, 16, 64);
+highbd_convolve_speed(aom_highbd_convolve8_sse2, 32, 64);
+highbd_convolve_speed(aom_highbd_convolve8_sse2, 64, 64);
+
+lowbd_convolve_speed(aom_convolve8_sse2, 8, 64);
+lowbd_convolve_speed(aom_convolve8_sse2, 16, 64);
+lowbd_convolve_speed(aom_convolve8_sse2, 32, 64);
+lowbd_convolve_speed(aom_convolve8_sse2, 64, 64);
+
+highbd_convolve_speed(aom_highbd_convolve8_sse2, 8, 1024);
+highbd_convolve_speed(aom_highbd_convolve8_sse2, 16, 1024);
+highbd_convolve_speed(aom_highbd_convolve8_sse2, 32, 1024);
+highbd_convolve_speed(aom_highbd_convolve8_sse2, 64, 1024);
+
+lowbd_convolve_speed(aom_convolve8_sse2, 8, 1024);
+lowbd_convolve_speed(aom_convolve8_sse2, 16, 1024);
+lowbd_convolve_speed(aom_convolve8_sse2, 32, 1024);
+lowbd_convolve_speed(aom_convolve8_sse2, 64, 1024);
+#endif // CONVOLVE_SPEED_TEST
} // namespace
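
The av1_convolve_avg and av1_highbd_convolve_avg checks above compare the blended output against ROUND_POWER_OF_TWO(dst0[0] + dst1[0], 1), i.e. the two single-prediction results averaged with rounding. A one-line sketch of that rounding:

    /* (a + b + 1) >> 1: average of two predictions, rounded half up. */
    static int round_avg_sketch(int a, int b) {
      return (a + b + 1) >> 1;
    }
    /* e.g. round_avg_sketch(3, 4) == 4, round_avg_sketch(4, 4) == 4 */
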
diff --git a/test/av1_dct_test.cc b/test/av1_dct_test.cc
index ac1a551..d5c23f6 100644
--- a/test/av1_dct_test.cc
+++ b/test/av1_dct_test.cc
@@ -102,5 +102,6 @@
C, AV1FwdTxfm,
::testing::Values(FdctParam(&fdct4, &reference_dct_1d, 4, 1),
FdctParam(&fdct8, &reference_dct_1d, 8, 1),
- FdctParam(&fdct16, &reference_dct_1d, 16, 2)));
+ FdctParam(&fdct16, &reference_dct_1d, 16, 2),
+ FdctParam(&fdct32, &reference_dct_1d, 32, 3)));
} // namespace
diff --git a/test/av1_ext_tile_test.cc b/test/av1_ext_tile_test.cc
new file mode 100644
index 0000000..9563fd5
--- /dev/null
+++ b/test/av1_ext_tile_test.cc
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+
+namespace {
+// The number of frames to be encoded/decoded
+const int kLimit = 8;
+// Skip 1 frame to check the frame decoding independence.
+const int kSkip = 5;
+const int kTileSize = 1;
+const int kTIleSizeInPixels = (kTileSize << 6);
+// Fake width and height so that they can be multiples of the tile size.
+const int kImgWidth = 704;
+const int kImgHeight = 576;
+
+class AV1ExtTileTest
+ : public ::libaom_test::EncoderTest,
+ public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int> {
+ protected:
+ AV1ExtTileTest()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ set_cpu_used_(GET_PARAM(2)) {
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ cfg.w = kImgWidth;
+ cfg.h = kImgHeight;
+
+ decoder_ = codec_->CreateDecoder(cfg, 0);
+ decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
+
+ // Allocate buffer to store tile image.
+ aom_img_alloc(&tile_img_, AOM_IMG_FMT_I420, kImgWidth, kImgHeight, 32);
+
+ md5_.clear();
+ tile_md5_.clear();
+ }
+
+ virtual ~AV1ExtTileTest() {
+ aom_img_free(&tile_img_);
+ delete decoder_;
+ }
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.g_error_resilient = 1;
+
+ cfg_.rc_max_quantizer = 56;
+ cfg_.rc_min_quantizer = 0;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 0) {
+ // Encode setting
+ encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+
+ // The tile size is 64x64.
+ encoder->Control(AV1E_SET_TILE_COLUMNS, kTileSize);
+ encoder->Control(AV1E_SET_TILE_ROWS, kTileSize);
+#if CONFIG_EXT_PARTITION
+ // Always use 64x64 max partition.
+ encoder->Control(AV1E_SET_SUPERBLOCK_SIZE, AOM_SUPERBLOCK_SIZE_64X64);
+#endif
+ }
+
+ if (video->frame() == 1) {
+ frame_flags_ =
+ AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF;
+ }
+ }
+
+ virtual void DecompressedFrameHook(const aom_image_t &img,
+ aom_codec_pts_t pts) {
+ // Skip 1 already decoded frame to be consistent with the decoder in this
+ // test.
+ if (pts == (aom_codec_pts_t)kSkip) return;
+
+ // Calculate MD5 as the reference.
+ ::libaom_test::MD5 md5_res;
+ md5_res.Add(&img);
+ md5_.push_back(md5_res.Get());
+ }
+
+ virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ // Skip decoding 1 frame.
+ if (pkt->data.frame.pts == (aom_codec_pts_t)kSkip) return;
+
+ bool IsLastFrame = (pkt->data.frame.pts == (aom_codec_pts_t)(kLimit - 1));
+
+ // Decode the first (kLimit - 1) frames as whole frames, and decode the
+ // last frame one tile at a time.
+ for (int r = 0; r < kImgHeight / kTIleSizeInPixels; ++r) {
+ for (int c = 0; c < kImgWidth / kTIleSizeInPixels; ++c) {
+ if (!IsLastFrame) {
+ decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
+ } else {
+ decoder_->Control(AV1_SET_DECODE_TILE_ROW, r);
+ decoder_->Control(AV1_SET_DECODE_TILE_COL, c);
+ }
+
+ const aom_codec_err_t res = decoder_->DecodeFrame(
+ reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
+ pkt->data.frame.sz);
+ if (res != AOM_CODEC_OK) {
+ abort_ = true;
+ ASSERT_EQ(AOM_CODEC_OK, res);
+ }
+ const aom_image_t *img = decoder_->GetDxData().Next();
+
+ if (!IsLastFrame) {
+ if (img) {
+ ::libaom_test::MD5 md5_res;
+ md5_res.Add(img);
+ tile_md5_.push_back(md5_res.Get());
+ }
+ break;
+ }
+
+ const int kMaxMBPlane = 3;
+ for (int plane = 0; plane < kMaxMBPlane; ++plane) {
+ const int shift = (plane == 0) ? 0 : 1;
+ int tile_height = kTIleSizeInPixels >> shift;
+ int tile_width = kTIleSizeInPixels >> shift;
+
+ for (int tr = 0; tr < tile_height; ++tr) {
+ memcpy(tile_img_.planes[plane] +
+ tile_img_.stride[plane] * (r * tile_height + tr) +
+ c * tile_width,
+ img->planes[plane] + img->stride[plane] * tr, tile_width);
+ }
+ }
+ }
+
+ if (!IsLastFrame) break;
+ }
+
+ if (IsLastFrame) {
+ ::libaom_test::MD5 md5_res;
+ md5_res.Add(&tile_img_);
+ tile_md5_.push_back(md5_res.Get());
+ }
+ }
+
+ ::libaom_test::TestMode encoding_mode_;
+ int set_cpu_used_;
+ ::libaom_test::Decoder *decoder_;
+ aom_image_t tile_img_;
+ std::vector<std::string> md5_;
+ std::vector<std::string> tile_md5_;
+};
+
+TEST_P(AV1ExtTileTest, DecoderResultTest) {
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", kImgWidth,
+ kImgHeight, 30, 1, 0, kLimit);
+ cfg_.rc_target_bitrate = 500;
+ cfg_.g_error_resilient = AOM_ERROR_RESILIENT_DEFAULT;
+ cfg_.g_lag_in_frames = 0;
+ cfg_.g_threads = 1;
+
+ // Tile encoding
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ // Compare to check if two vectors are equal.
+ ASSERT_EQ(md5_, tile_md5_);
+}
+
+AV1_INSTANTIATE_TEST_CASE(
+ // Now only test 2-pass mode.
+ AV1ExtTileTest, ::testing::Values(::libaom_test::kTwoPassGood),
+ ::testing::Range(0, 4));
+} // namespace
diff --git a/test/av1_fht16x16_test.cc b/test/av1_fht16x16_test.cc
new file mode 100644
index 0000000..0b89071
--- /dev/null
+++ b/test/av1_fht16x16_test.cc
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./av1_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "aom_ports/mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type);
+using std::tr1::tuple;
+using libaom_test::FhtFunc;
+typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht16x16Param;
+
+void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+ av1_fht16x16_c(in, out, stride, tx_type);
+}
+
+void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
+ int tx_type) {
+ av1_iht16x16_256_add_c(in, dest, stride, tx_type);
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type, int bd);
+typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
+ int tx_type, int bd);
+
+// Target optimized function, tx_type, bit depth
+typedef tuple<HbdHtFunc, int, int> HighbdHt16x16Param;
+
+void highbd_fht16x16_ref(const int16_t *in, int32_t *out, int stride,
+ int tx_type, int bd) {
+ av1_fwd_txfm2d_16x16_c(in, out, stride, tx_type, bd);
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+class AV1Trans16x16HT : public libaom_test::TransformTestBase,
+ public ::testing::TestWithParam<Ht16x16Param> {
+ public:
+ virtual ~AV1Trans16x16HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ tx_type_ = GET_PARAM(2);
+ pitch_ = 16;
+ height_ = 16;
+ fwd_txfm_ref = fht16x16_ref;
+ inv_txfm_ref = iht16x16_ref;
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = GET_PARAM(4);
+ }
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+ fwd_txfm_(in, out, stride, tx_type_);
+ }
+
+ void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+ inv_txfm_(out, dst, stride, tx_type_);
+ }
+
+ FhtFunc fwd_txfm_;
+ IhtFunc inv_txfm_;
+};
+
+TEST_P(AV1Trans16x16HT, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(AV1Trans16x16HT, InvCoeffCheck) { RunInvCoeffCheck(); }
+
+#if CONFIG_AOM_HIGHBITDEPTH
+class AV1HighbdTrans16x16HT
+ : public ::testing::TestWithParam<HighbdHt16x16Param> {
+ public:
+ virtual ~AV1HighbdTrans16x16HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ fwd_txfm_ref_ = highbd_fht16x16_ref;
+ tx_type_ = GET_PARAM(1);
+ bit_depth_ = GET_PARAM(2);
+ mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = 256;
+
+ input_ = reinterpret_cast<int16_t *>(
+ aom_memalign(32, sizeof(int16_t) * num_coeffs_));
+ output_ = reinterpret_cast<int32_t *>(
+ aom_memalign(32, sizeof(int32_t) * num_coeffs_));
+ output_ref_ = reinterpret_cast<int32_t *>(
+ aom_memalign(32, sizeof(int32_t) * num_coeffs_));
+ }
+
+ virtual void TearDown() {
+ aom_free(input_);
+ aom_free(output_);
+ aom_free(output_ref_);
+ libaom_test::ClearSystemState();
+ }
+
+ protected:
+ void RunBitexactCheck();
+
+ private:
+ HbdHtFunc fwd_txfm_;
+ HbdHtFunc fwd_txfm_ref_;
+ int tx_type_;
+ int bit_depth_;
+ int mask_;
+ int num_coeffs_;
+ int16_t *input_;
+ int32_t *output_;
+ int32_t *output_ref_;
+};
+
+void AV1HighbdTrans16x16HT::RunBitexactCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ int i, j;
+ const int stride = 16;
+ const int num_tests = 1000;
+
+ for (i = 0; i < num_tests; ++i) {
+ for (j = 0; j < num_coeffs_; ++j) {
+ input_[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+ }
+
+ fwd_txfm_ref_(input_, output_ref_, stride, tx_type_, bit_depth_);
+ ASM_REGISTER_STATE_CHECK(
+ fwd_txfm_(input_, output_, stride, tx_type_, bit_depth_));
+
+ for (j = 0; j < num_coeffs_; ++j) {
+ EXPECT_EQ(output_ref_[j], output_[j])
+ << "Not bit-exact result at index: " << j << " at test block: " << i;
+ }
+ }
+}
+
+TEST_P(AV1HighbdTrans16x16HT, HighbdCoeffCheck) { RunBitexactCheck(); }
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+const Ht16x16Param kArrayHt16x16Param_sse2[] = {
+ make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 0, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 1, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 2, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 3, AOM_BITS_8,
+ 256),
+#if CONFIG_EXT_TX
+ make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 4, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 5, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 6, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 7, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 8, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 10, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 11, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 12, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 13, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 14, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2, 15, AOM_BITS_8,
+ 256)
+#endif // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans16x16HT,
+ ::testing::ValuesIn(kArrayHt16x16Param_sse2));
+#endif // HAVE_SSE2
+
+#if HAVE_AVX2
+const Ht16x16Param kArrayHt16x16Param_avx2[] = {
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 0, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 1, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 2, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 3, AOM_BITS_8, 256),
+#if CONFIG_EXT_TX
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 4, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 5, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 6, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 7, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 8, AOM_BITS_8, 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 10, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 11, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 12, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 13, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 14, AOM_BITS_8,
+ 256),
+ make_tuple(&av1_fht16x16_avx2, av1_iht16x16_256_add_avx2, 15, AOM_BITS_8, 256)
+#endif // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(AVX2, AV1Trans16x16HT,
+ ::testing::ValuesIn(kArrayHt16x16Param_avx2));
+#endif // HAVE_AVX2
+
+#if HAVE_SSE4_1 && CONFIG_AOM_HIGHBITDEPTH
+const HighbdHt16x16Param kArrayHBDHt16x16Param_sse4_1[] = {
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 0, 10),
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 0, 12),
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 1, 10),
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 1, 12),
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 2, 10),
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 2, 12),
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 3, 10),
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 3, 12),
+#if CONFIG_EXT_TX
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 4, 10),
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 4, 12),
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 5, 10),
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 5, 12),
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 6, 10),
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 6, 12),
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 7, 10),
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 7, 12),
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 8, 10),
+ make_tuple(&av1_fwd_txfm2d_16x16_sse4_1, 8, 12),
+#endif // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdTrans16x16HT,
+ ::testing::ValuesIn(kArrayHBDHt16x16Param_sse4_1));
+#endif // HAVE_SSE4_1 && CONFIG_AOM_HIGHBITDEPTH
+
+} // namespace
diff --git a/test/av1_fht16x32_test.cc b/test/av1_fht16x32_test.cc
new file mode 100644
index 0000000..8ff96b3
--- /dev/null
+++ b/test/av1_fht16x32_test.cc
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type);
+using std::tr1::tuple;
+using libaom_test::FhtFunc;
+typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht16x32Param;
+
+void fht16x32_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+ av1_fht16x32_c(in, out, stride, tx_type);
+}
+
+void iht16x32_ref(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+ av1_iht16x32_512_add_c(in, out, stride, tx_type);
+}
+
+class AV1Trans16x32HT : public libaom_test::TransformTestBase,
+ public ::testing::TestWithParam<Ht16x32Param> {
+ public:
+ virtual ~AV1Trans16x32HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ tx_type_ = GET_PARAM(2);
+ pitch_ = 16;
+ height_ = 32;
+ fwd_txfm_ref = fht16x32_ref;
+ inv_txfm_ref = iht16x32_ref;
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = GET_PARAM(4);
+ }
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+ fwd_txfm_(in, out, stride, tx_type_);
+ }
+
+ void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+ inv_txfm_(out, dst, stride, tx_type_);
+ }
+
+ FhtFunc fwd_txfm_;
+ IhtFunc inv_txfm_;
+};
+
+TEST_P(AV1Trans16x32HT, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(AV1Trans16x32HT, InvCoeffCheck) { RunInvCoeffCheck(); }
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+const Ht16x32Param kArrayHt16x32Param_sse2[] = {
+ make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 0, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 1, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 2, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 3, AOM_BITS_8,
+ 512),
+#if CONFIG_EXT_TX
+ make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 4, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 5, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 6, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 7, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 8, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 9, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 10, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 11, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 12, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 13, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 14, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht16x32_sse2, &av1_iht16x32_512_add_sse2, 15, AOM_BITS_8,
+ 512)
+#endif // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans16x32HT,
+ ::testing::ValuesIn(kArrayHt16x32Param_sse2));
+#endif // HAVE_SSE2
+
+} // namespace
diff --git a/test/av1_fht16x8_test.cc b/test/av1_fht16x8_test.cc
new file mode 100644
index 0000000..c2878c4
--- /dev/null
+++ b/test/av1_fht16x8_test.cc
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type);
+using std::tr1::tuple;
+using libaom_test::FhtFunc;
+typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht16x8Param;
+
+void fht16x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+ av1_fht16x8_c(in, out, stride, tx_type);
+}
+
+void iht16x8_ref(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+ av1_iht16x8_128_add_c(in, out, stride, tx_type);
+}
+
+class AV1Trans16x8HT : public libaom_test::TransformTestBase,
+ public ::testing::TestWithParam<Ht16x8Param> {
+ public:
+ virtual ~AV1Trans16x8HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ tx_type_ = GET_PARAM(2);
+ pitch_ = 16;
+ height_ = 8;
+ inv_txfm_ref = iht16x8_ref;
+ fwd_txfm_ref = fht16x8_ref;
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = GET_PARAM(4);
+ }
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+ fwd_txfm_(in, out, stride, tx_type_);
+ }
+
+ void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+ inv_txfm_(out, dst, stride, tx_type_);
+ }
+
+ FhtFunc fwd_txfm_;
+ IhtFunc inv_txfm_;
+};
+
+TEST_P(AV1Trans16x8HT, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(AV1Trans16x8HT, InvCoeffCheck) { RunInvCoeffCheck(); }
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+const Ht16x8Param kArrayHt16x8Param_sse2[] = {
+ make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 0, AOM_BITS_8, 128),
+ make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 1, AOM_BITS_8, 128),
+ make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 2, AOM_BITS_8, 128),
+ make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 3, AOM_BITS_8, 128),
+#if CONFIG_EXT_TX
+ make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 4, AOM_BITS_8, 128),
+ make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 5, AOM_BITS_8, 128),
+ make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 6, AOM_BITS_8, 128),
+ make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 7, AOM_BITS_8, 128),
+ make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 8, AOM_BITS_8, 128),
+ make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 9, AOM_BITS_8, 128),
+ make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 10, AOM_BITS_8, 128),
+ make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 11, AOM_BITS_8, 128),
+ make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 12, AOM_BITS_8, 128),
+ make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 13, AOM_BITS_8, 128),
+ make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 14, AOM_BITS_8, 128),
+ make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 15, AOM_BITS_8, 128)
+#endif // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans16x8HT,
+ ::testing::ValuesIn(kArrayHt16x8Param_sse2));
+#endif // HAVE_SSE2
+
+} // namespace
diff --git a/test/av1_fht32x16_test.cc b/test/av1_fht32x16_test.cc
new file mode 100644
index 0000000..41c0b1c
--- /dev/null
+++ b/test/av1_fht32x16_test.cc
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type);
+using std::tr1::tuple;
+using libaom_test::FhtFunc;
+typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht32x16Param;
+
+void fht32x16_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+ av1_fht32x16_c(in, out, stride, tx_type);
+}
+
+void iht32x16_ref(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+ av1_iht32x16_512_add_c(in, out, stride, tx_type);
+}
+
+class AV1Trans32x16HT : public libaom_test::TransformTestBase,
+ public ::testing::TestWithParam<Ht32x16Param> {
+ public:
+ virtual ~AV1Trans32x16HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ tx_type_ = GET_PARAM(2);
+ pitch_ = 32;
+ height_ = 16;
+ fwd_txfm_ref = fht32x16_ref;
+ inv_txfm_ref = iht32x16_ref;
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = GET_PARAM(4);
+ }
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+ fwd_txfm_(in, out, stride, tx_type_);
+ }
+
+ void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+ inv_txfm_(out, dst, stride, tx_type_);
+ }
+
+ FhtFunc fwd_txfm_;
+ IhtFunc inv_txfm_;
+};
+
+TEST_P(AV1Trans32x16HT, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(AV1Trans32x16HT, InvCoeffCheck) { RunInvCoeffCheck(); }
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+const Ht32x16Param kArrayHt32x16Param_sse2[] = {
+ make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 0, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 1, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 2, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 3, AOM_BITS_8,
+ 512),
+#if CONFIG_EXT_TX
+ make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 4, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 5, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 6, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 7, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 8, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 9, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 10, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 11, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 12, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 13, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 14, AOM_BITS_8,
+ 512),
+ make_tuple(&av1_fht32x16_sse2, &av1_iht32x16_512_add_sse2, 15, AOM_BITS_8,
+ 512)
+#endif // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans32x16HT,
+ ::testing::ValuesIn(kArrayHt32x16Param_sse2));
+#endif // HAVE_SSE2
+
+} // namespace
diff --git a/test/av1_fht4x4_test.cc b/test/av1_fht4x4_test.cc
new file mode 100644
index 0000000..075646c
--- /dev/null
+++ b/test/av1_fht4x4_test.cc
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./av1_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "aom_ports/mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type);
+using std::tr1::tuple;
+using libaom_test::FhtFunc;
+typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht4x4Param;
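+// Ht4x4Param argument list:
+// <forward transform, inverse transform, tx_type, bit depth, num coeffs>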
+
+void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+ av1_fht4x4_c(in, out, stride, tx_type);
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+typedef void (*IhighbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type, int bd);
+typedef void (*HBDFhtFunc)(const int16_t *input, int32_t *output, int stride,
+ int tx_type, int bd);
+
+// HighbdHt4x4Param argument list:
+// <Target optimized function, tx_type, bit depth>
+typedef tuple<HBDFhtFunc, int, int> HighbdHt4x4Param;
+
+void highbe_fht4x4_ref(const int16_t *in, int32_t *out, int stride, int tx_type,
+ int bd) {
+ av1_fwd_txfm2d_4x4_c(in, out, stride, tx_type, bd);
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+class AV1Trans4x4HT : public libaom_test::TransformTestBase,
+ public ::testing::TestWithParam<Ht4x4Param> {
+ public:
+ virtual ~AV1Trans4x4HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ tx_type_ = GET_PARAM(2);
+ pitch_ = 4;
+ height_ = 4;
+ fwd_txfm_ref = fht4x4_ref;
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = GET_PARAM(4);
+ }
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+ fwd_txfm_(in, out, stride, tx_type_);
+ }
+
+ void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+ inv_txfm_(out, dst, stride, tx_type_);
+ }
+
+ FhtFunc fwd_txfm_;
+ IhtFunc inv_txfm_;
+};
+
+TEST_P(AV1Trans4x4HT, CoeffCheck) { RunCoeffCheck(); }
+
+#if CONFIG_AOM_HIGHBITDEPTH
+class AV1HighbdTrans4x4HT : public ::testing::TestWithParam<HighbdHt4x4Param> {
+ public:
+ virtual ~AV1HighbdTrans4x4HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ fwd_txfm_ref_ = highbe_fht4x4_ref;
+ tx_type_ = GET_PARAM(1);
+ bit_depth_ = GET_PARAM(2);
+ mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = 16;
+
+ input_ = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+ output_ = reinterpret_cast<int32_t *>(
+ aom_memalign(16, sizeof(int32_t) * num_coeffs_));
+ output_ref_ = reinterpret_cast<int32_t *>(
+ aom_memalign(16, sizeof(int32_t) * num_coeffs_));
+ }
+
+ virtual void TearDown() {
+ aom_free(input_);
+ aom_free(output_);
+ aom_free(output_ref_);
+ libaom_test::ClearSystemState();
+ }
+
+ protected:
+ void RunBitexactCheck();
+
+ private:
+ HBDFhtFunc fwd_txfm_;
+ HBDFhtFunc fwd_txfm_ref_;
+ int tx_type_;
+ int bit_depth_;
+ int mask_;
+ int num_coeffs_;
+ int16_t *input_;
+ int32_t *output_;
+ int32_t *output_ref_;
+};
+
+void AV1HighbdTrans4x4HT::RunBitexactCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ int i, j;
+ const int stride = 4;
+ const int num_tests = 1000;
+ const int num_coeffs = 16;
+
+ for (i = 0; i < num_tests; ++i) {
+ for (j = 0; j < num_coeffs; ++j) {
+ input_[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+ }
+
+ fwd_txfm_ref_(input_, output_ref_, stride, tx_type_, bit_depth_);
+ fwd_txfm_(input_, output_, stride, tx_type_, bit_depth_);
+
+ for (j = 0; j < num_coeffs; ++j) {
+ EXPECT_EQ(output_[j], output_ref_[j])
+ << "Not bit-exact result at index: " << j << " at test block: " << i;
+ }
+ }
+}
+
+TEST_P(AV1HighbdTrans4x4HT, HighbdCoeffCheck) { RunBitexactCheck(); }
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+const Ht4x4Param kArrayHt4x4Param_sse2[] = {
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 0, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 1, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 2, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 3, AOM_BITS_8, 16),
+#if CONFIG_EXT_TX
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 4, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 5, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 6, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 7, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 8, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 10, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 11, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 12, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 13, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 14, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 15, AOM_BITS_8, 16)
+#endif // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans4x4HT,
+ ::testing::ValuesIn(kArrayHt4x4Param_sse2));
+#endif // HAVE_SSE2
+
+#if HAVE_SSE4_1 && CONFIG_AOM_HIGHBITDEPTH
+const HighbdHt4x4Param kArrayHighbdHt4x4Param[] = {
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 0, 10),
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 0, 12),
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 1, 10),
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 1, 12),
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 2, 10),
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 2, 12),
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 3, 10),
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 3, 12),
+#if CONFIG_EXT_TX
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 4, 10),
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 4, 12),
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 5, 10),
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 5, 12),
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 6, 10),
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 6, 12),
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 7, 10),
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 7, 12),
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 8, 10),
+ make_tuple(&av1_fwd_txfm2d_4x4_sse4_1, 8, 12),
+#endif // CONFIG_EXT_TX
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdTrans4x4HT,
+ ::testing::ValuesIn(kArrayHighbdHt4x4Param));
+
+#endif // HAVE_SSE4_1 && CONFIG_AOM_HIGHBITDEPTH
+
+} // namespace
diff --git a/test/av1_fht4x8_test.cc b/test/av1_fht4x8_test.cc
new file mode 100644
index 0000000..4a5f3ff
--- /dev/null
+++ b/test/av1_fht4x8_test.cc
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type);
+using std::tr1::tuple;
+using libaom_test::FhtFunc;
+typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht4x8Param;
+
+void fht4x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+ av1_fht4x8_c(in, out, stride, tx_type);
+}
+
+void iht4x8_ref(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+ av1_iht4x8_32_add_c(in, out, stride, tx_type);
+}
+
+class AV1Trans4x8HT : public libaom_test::TransformTestBase,
+ public ::testing::TestWithParam<Ht4x8Param> {
+ public:
+ virtual ~AV1Trans4x8HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ tx_type_ = GET_PARAM(2);
+ pitch_ = 4;
+ height_ = 8;
+ fwd_txfm_ref = fht4x8_ref;
+ inv_txfm_ref = iht4x8_ref;
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = GET_PARAM(4);
+ }
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+ fwd_txfm_(in, out, stride, tx_type_);
+ }
+
+ void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+ inv_txfm_(out, dst, stride, tx_type_);
+ }
+
+ FhtFunc fwd_txfm_;
+ IhtFunc inv_txfm_;
+};
+
+TEST_P(AV1Trans4x8HT, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(AV1Trans4x8HT, InvCoeffCheck) { RunInvCoeffCheck(); }
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+const Ht4x8Param kArrayHt4x8Param_sse2[] = {
+ make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 0, AOM_BITS_8, 32),
+ make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 1, AOM_BITS_8, 32),
+ make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 2, AOM_BITS_8, 32),
+ make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 3, AOM_BITS_8, 32),
+#if CONFIG_EXT_TX
+ make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 4, AOM_BITS_8, 32),
+ make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 5, AOM_BITS_8, 32),
+ make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 6, AOM_BITS_8, 32),
+ make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 7, AOM_BITS_8, 32),
+ make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 8, AOM_BITS_8, 32),
+ make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 9, AOM_BITS_8, 32),
+ make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 10, AOM_BITS_8, 32),
+ make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 11, AOM_BITS_8, 32),
+ make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 12, AOM_BITS_8, 32),
+ make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 13, AOM_BITS_8, 32),
+ make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 14, AOM_BITS_8, 32),
+ make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 15, AOM_BITS_8, 32)
+#endif // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans4x8HT,
+ ::testing::ValuesIn(kArrayHt4x8Param_sse2));
+#endif // HAVE_SSE2
+
+} // namespace
diff --git a/test/av1_fht8x16_test.cc b/test/av1_fht8x16_test.cc
new file mode 100644
index 0000000..42907fd
--- /dev/null
+++ b/test/av1_fht8x16_test.cc
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type);
+using std::tr1::tuple;
+using libaom_test::FhtFunc;
+typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht8x16Param;
+
+void fht8x16_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+ av1_fht8x16_c(in, out, stride, tx_type);
+}
+
+void iht8x16_ref(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+ av1_iht8x16_128_add_c(in, out, stride, tx_type);
+}
+
+class AV1Trans8x16HT : public libaom_test::TransformTestBase,
+ public ::testing::TestWithParam<Ht8x16Param> {
+ public:
+ virtual ~AV1Trans8x16HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ tx_type_ = GET_PARAM(2);
+ pitch_ = 8;
+ height_ = 16;
+ inv_txfm_ref = iht8x16_ref;
+ fwd_txfm_ref = fht8x16_ref;
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = GET_PARAM(4);
+ }
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+ fwd_txfm_(in, out, stride, tx_type_);
+ }
+
+ void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+ inv_txfm_(out, dst, stride, tx_type_);
+ }
+
+ FhtFunc fwd_txfm_;
+ IhtFunc inv_txfm_;
+};
+
+TEST_P(AV1Trans8x16HT, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(AV1Trans8x16HT, InvCoeffCheck) { RunInvCoeffCheck(); }
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+const Ht8x16Param kArrayHt8x16Param_sse2[] = {
+ make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 0, AOM_BITS_8, 128),
+ make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 1, AOM_BITS_8, 128),
+ make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 2, AOM_BITS_8, 128),
+ make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 3, AOM_BITS_8, 128),
+#if CONFIG_EXT_TX
+ make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 4, AOM_BITS_8, 128),
+ make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 5, AOM_BITS_8, 128),
+ make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 6, AOM_BITS_8, 128),
+ make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 7, AOM_BITS_8, 128),
+ make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 8, AOM_BITS_8, 128),
+ make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 9, AOM_BITS_8, 128),
+ make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 10, AOM_BITS_8, 128),
+ make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 11, AOM_BITS_8, 128),
+ make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 12, AOM_BITS_8, 128),
+ make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 13, AOM_BITS_8, 128),
+ make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 14, AOM_BITS_8, 128),
+ make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 15, AOM_BITS_8, 128)
+#endif // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans8x16HT,
+ ::testing::ValuesIn(kArrayHt8x16Param_sse2));
+#endif // HAVE_SSE2
+
+} // namespace
diff --git a/test/av1_fht8x4_test.cc b/test/av1_fht8x4_test.cc
new file mode 100644
index 0000000..46e8c62
--- /dev/null
+++ b/test/av1_fht8x4_test.cc
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type);
+using std::tr1::tuple;
+using libaom_test::FhtFunc;
+typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht8x4Param;
+
+void fht8x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+ av1_fht8x4_c(in, out, stride, tx_type);
+}
+
+void iht8x4_ref(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+ av1_iht8x4_32_add_c(in, out, stride, tx_type);
+}
+
+class AV1Trans8x4HT : public libaom_test::TransformTestBase,
+ public ::testing::TestWithParam<Ht8x4Param> {
+ public:
+ virtual ~AV1Trans8x4HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ tx_type_ = GET_PARAM(2);
+ pitch_ = 8;
+ height_ = 4;
+ fwd_txfm_ref = fht8x4_ref;
+ inv_txfm_ref = iht8x4_ref;
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = GET_PARAM(4);
+ }
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+ fwd_txfm_(in, out, stride, tx_type_);
+ }
+
+ void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+ inv_txfm_(out, dst, stride, tx_type_);
+ }
+
+ FhtFunc fwd_txfm_;
+ IhtFunc inv_txfm_;
+};
+
+TEST_P(AV1Trans8x4HT, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(AV1Trans8x4HT, InvCoeffCheck) { RunInvCoeffCheck(); }
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+const Ht8x4Param kArrayHt8x4Param_sse2[] = {
+ make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 0, AOM_BITS_8, 32),
+ make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 1, AOM_BITS_8, 32),
+ make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 2, AOM_BITS_8, 32),
+ make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 3, AOM_BITS_8, 32),
+#if CONFIG_EXT_TX
+ make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 4, AOM_BITS_8, 32),
+ make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 5, AOM_BITS_8, 32),
+ make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 6, AOM_BITS_8, 32),
+ make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 7, AOM_BITS_8, 32),
+ make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 8, AOM_BITS_8, 32),
+ make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 9, AOM_BITS_8, 32),
+ make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 10, AOM_BITS_8, 32),
+ make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 11, AOM_BITS_8, 32),
+ make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 12, AOM_BITS_8, 32),
+ make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 13, AOM_BITS_8, 32),
+ make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 14, AOM_BITS_8, 32),
+ make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 15, AOM_BITS_8, 32)
+#endif // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans8x4HT,
+ ::testing::ValuesIn(kArrayHt8x4Param_sse2));
+#endif // HAVE_SSE2
+
+} // namespace
diff --git a/test/av1_fht8x8_test.cc b/test/av1_fht8x8_test.cc
new file mode 100644
index 0000000..54cb405
--- /dev/null
+++ b/test/av1_fht8x8_test.cc
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./av1_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "aom_ports/mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type);
+
+using libaom_test::FhtFunc;
+using std::tr1::tuple;
+typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht8x8Param;
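+// Ht8x8Param argument list:
+// <forward transform, inverse transform, tx_type, bit depth, num coeffs>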
+
+void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+ av1_fht8x8_c(in, out, stride, tx_type);
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type, int bd);
+typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
+ int tx_type, int bd);
+// Target optimized function, tx_type, bit depth
+typedef tuple<HbdHtFunc, int, int> HighbdHt8x8Param;
+
+void highbd_fht8x8_ref(const int16_t *in, int32_t *out, int stride, int tx_type,
+ int bd) {
+ av1_fwd_txfm2d_8x8_c(in, out, stride, tx_type, bd);
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+class AV1Trans8x8HT : public libaom_test::TransformTestBase,
+ public ::testing::TestWithParam<Ht8x8Param> {
+ public:
+ virtual ~AV1Trans8x8HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ tx_type_ = GET_PARAM(2);
+ pitch_ = 8;
+ height_ = 8;
+ fwd_txfm_ref = fht8x8_ref;
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = GET_PARAM(4);
+ }
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+ fwd_txfm_(in, out, stride, tx_type_);
+ }
+
+ void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+ inv_txfm_(out, dst, stride, tx_type_);
+ }
+
+ FhtFunc fwd_txfm_;
+ IhtFunc inv_txfm_;
+};
+
+TEST_P(AV1Trans8x8HT, CoeffCheck) { RunCoeffCheck(); }
+
+#if CONFIG_AOM_HIGHBITDEPTH
+class AV1HighbdTrans8x8HT : public ::testing::TestWithParam<HighbdHt8x8Param> {
+ public:
+ virtual ~AV1HighbdTrans8x8HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ fwd_txfm_ref_ = highbd_fht8x8_ref;
+ tx_type_ = GET_PARAM(1);
+ bit_depth_ = GET_PARAM(2);
+ mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = 64;
+
+ input_ = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+ output_ = reinterpret_cast<int32_t *>(
+ aom_memalign(16, sizeof(int32_t) * num_coeffs_));
+ output_ref_ = reinterpret_cast<int32_t *>(
+ aom_memalign(16, sizeof(int32_t) * num_coeffs_));
+ }
+
+ virtual void TearDown() {
+ aom_free(input_);
+ aom_free(output_);
+ aom_free(output_ref_);
+ libaom_test::ClearSystemState();
+ }
+
+ protected:
+ void RunBitexactCheck();
+
+ private:
+ HbdHtFunc fwd_txfm_;
+ HbdHtFunc fwd_txfm_ref_;
+ int tx_type_;
+ int bit_depth_;
+ int mask_;
+ int num_coeffs_;
+ int16_t *input_;
+ int32_t *output_;
+ int32_t *output_ref_;
+};
+
+void AV1HighbdTrans8x8HT::RunBitexactCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ int i, j;
+ const int stride = 8;
+ const int num_tests = 1000;
+ const int num_coeffs = 64;
+
+ for (i = 0; i < num_tests; ++i) {
+ for (j = 0; j < num_coeffs; ++j) {
+ input_[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+ }
+
+ fwd_txfm_ref_(input_, output_ref_, stride, tx_type_, bit_depth_);
+ ASM_REGISTER_STATE_CHECK(
+ fwd_txfm_(input_, output_, stride, tx_type_, bit_depth_));
+
+ for (j = 0; j < num_coeffs; ++j) {
+ EXPECT_EQ(output_ref_[j], output_[j])
+ << "Not bit-exact result at index: " << j << " at test block: " << i;
+ }
+ }
+}
+
+TEST_P(AV1HighbdTrans8x8HT, HighbdCoeffCheck) { RunBitexactCheck(); }
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+const Ht8x8Param kArrayHt8x8Param_sse2[] = {
+ make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 0, AOM_BITS_8, 64),
+ make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 1, AOM_BITS_8, 64),
+ make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 2, AOM_BITS_8, 64),
+ make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 3, AOM_BITS_8, 64),
+#if CONFIG_EXT_TX
+ make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 4, AOM_BITS_8, 64),
+ make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 5, AOM_BITS_8, 64),
+ make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 6, AOM_BITS_8, 64),
+ make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 7, AOM_BITS_8, 64),
+ make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 8, AOM_BITS_8, 64),
+ make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 10, AOM_BITS_8, 64),
+ make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 11, AOM_BITS_8, 64),
+ make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 12, AOM_BITS_8, 64),
+ make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 13, AOM_BITS_8, 64),
+ make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 14, AOM_BITS_8, 64),
+ make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_sse2, 15, AOM_BITS_8, 64)
+#endif // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans8x8HT,
+ ::testing::ValuesIn(kArrayHt8x8Param_sse2));
+#endif // HAVE_SSE2
+
+#if HAVE_SSE4_1 && CONFIG_AOM_HIGHBITDEPTH
+const HighbdHt8x8Param kArrayHBDHt8x8Param_sse4_1[] = {
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 0, 10),
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 0, 12),
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 1, 10),
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 1, 12),
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 2, 10),
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 2, 12),
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 3, 10),
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 3, 12),
+#if CONFIG_EXT_TX
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 4, 10),
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 4, 12),
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 5, 10),
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 5, 12),
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 6, 10),
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 6, 12),
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 7, 10),
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 7, 12),
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 8, 10),
+ make_tuple(&av1_fwd_txfm2d_8x8_sse4_1, 8, 12),
+#endif // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdTrans8x8HT,
+ ::testing::ValuesIn(kArrayHBDHt8x8Param_sse4_1));
+#endif // HAVE_SSE4_1 && CONFIG_AOM_HIGHBITDEPTH
+
+} // namespace
diff --git a/test/av1_fwd_txfm1d_test.cc b/test/av1_fwd_txfm1d_test.cc
new file mode 100644
index 0000000..f671097
--- /dev/null
+++ b/test/av1_fwd_txfm1d_test.cc
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "av1/common/av1_fwd_txfm1d.h"
+#include "test/av1_txfm_test.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::input_base;
+using libaom_test::reference_hybrid_1d;
+using libaom_test::TYPE_TXFM;
+using libaom_test::TYPE_DCT;
+using libaom_test::TYPE_ADST;
+
+namespace {
+const int txfm_type_num = 2;
+const TYPE_TXFM txfm_type_ls[2] = { TYPE_DCT, TYPE_ADST };
+
+const int txfm_size_num = 5;
+const int txfm_size_ls[5] = { 4, 8, 16, 32, 64 };
+
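+// 1-D forward transform functions indexed by [transform type][transform size].
+// Entries are NULL where no implementation is available for that size.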
+const TxfmFunc fwd_txfm_func_ls[2][5] = {
+#if CONFIG_TX64X64
+ { av1_fdct4_new, av1_fdct8_new, av1_fdct16_new, av1_fdct32_new,
+ av1_fdct64_new },
+#else
+ { av1_fdct4_new, av1_fdct8_new, av1_fdct16_new, av1_fdct32_new, NULL },
+#endif
+ { av1_fadst4_new, av1_fadst8_new, av1_fadst16_new, av1_fadst32_new, NULL }
+};
+
+// The maximum stage number of a fwd/inv 1-D DCT/ADST transform is 12.
+const int8_t cos_bit[12] = { 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14 };
+const int8_t range_bit[12] = { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 };
+
+TEST(av1_fwd_txfm1d, round_shift) {
+ EXPECT_EQ(round_shift(7, 1), 4);
+ EXPECT_EQ(round_shift(-7, 1), -3);
+
+ EXPECT_EQ(round_shift(7, 2), 2);
+ EXPECT_EQ(round_shift(-7, 2), -2);
+
+ EXPECT_EQ(round_shift(8, 2), 2);
+ EXPECT_EQ(round_shift(-8, 2), -2);
+}
+
+TEST(av1_fwd_txfm1d, get_max_bit) {
+ int max_bit = get_max_bit(8);
+ EXPECT_EQ(max_bit, 3);
+}
+
+TEST(av1_fwd_txfm1d, cospi_arr) {
+ for (int i = 0; i < 7; i++) {
+ for (int j = 0; j < 64; j++) {
+ EXPECT_EQ(cospi_arr[i][j],
+ (int32_t)round(cos(M_PI * j / 128) * (1 << (cos_bit_min + i))));
+ }
+ }
+}
+
+TEST(av1_fwd_txfm1d, clamp_block) {
+ int16_t block[5][5] = { { 7, -5, 6, -3, 9 },
+ { 7, -5, 6, -3, 9 },
+ { 7, -5, 6, -3, 9 },
+ { 7, -5, 6, -3, 9 },
+ { 7, -5, 6, -3, 9 } };
+
+ int16_t ref_block[5][5] = { { 7, -5, 6, -3, 9 },
+ { 7, -5, 6, -3, 9 },
+ { 7, -4, 2, -3, 9 },
+ { 7, -4, 2, -3, 9 },
+ { 7, -4, 2, -3, 9 } };
+
+ int row = 2;
+ int col = 1;
+ int block_size = 3;
+ int stride = 5;
+ clamp_block(block[row] + col, block_size, stride, -4, 2);
+ for (int r = 0; r < stride; r++) {
+ for (int c = 0; c < stride; c++) {
+ EXPECT_EQ(block[r][c], ref_block[r][c]);
+ }
+ }
+}
+
+TEST(av1_fwd_txfm1d, accuracy) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ for (int si = 0; si < txfm_size_num; ++si) {
+ int txfm_size = txfm_size_ls[si];
+ int32_t *input = new int32_t[txfm_size];
+ int32_t *output = new int32_t[txfm_size];
+ double *ref_input = new double[txfm_size];
+ double *ref_output = new double[txfm_size];
+
+ for (int ti = 0; ti < txfm_type_num; ++ti) {
+ TYPE_TXFM txfm_type = txfm_type_ls[ti];
+ TxfmFunc fwd_txfm_func = fwd_txfm_func_ls[ti][si];
+ int max_error = 7;
+
+ const int count_test_block = 5000;
+ if (fwd_txfm_func != NULL) {
+ for (int bi = 0; bi < count_test_block; ++bi) {
+ for (int ni = 0; ni < txfm_size; ++ni) {
+ input[ni] = rnd.Rand16() % input_base - rnd.Rand16() % input_base;
+ ref_input[ni] = static_cast<double>(input[ni]);
+ }
+
+ fwd_txfm_func(input, output, cos_bit, range_bit);
+ reference_hybrid_1d(ref_input, ref_output, txfm_size, txfm_type);
+
+ for (int ni = 0; ni < txfm_size; ++ni) {
+ EXPECT_LE(
+ abs(output[ni] - static_cast<int32_t>(round(ref_output[ni]))),
+ max_error);
+ }
+ }
+ }
+ }
+
+ delete[] input;
+ delete[] output;
+ delete[] ref_input;
+ delete[] ref_output;
+ }
+}
+} // namespace
diff --git a/test/av1_fwd_txfm2d_test.cc b/test/av1_fwd_txfm2d_test.cc
new file mode 100644
index 0000000..675edb0
--- /dev/null
+++ b/test/av1_fwd_txfm2d_test.cc
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/av1_txfm_test.h"
+#include "av1/common/av1_txfm.h"
+#include "./av1_rtcd.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::input_base;
+using libaom_test::bd;
+using libaom_test::compute_avg_abs_error;
+using libaom_test::Fwd_Txfm2d_Func;
+using libaom_test::TYPE_TXFM;
+
+namespace {
+#if CONFIG_AOM_HIGHBITDEPTH
+// tx_type_, tx_size_, max_error_, max_avg_error_
+typedef std::tr1::tuple<TX_TYPE, TX_SIZE, double, double> AV1FwdTxfm2dParam;
+
+class AV1FwdTxfm2d : public ::testing::TestWithParam<AV1FwdTxfm2dParam> {
+ public:
+ virtual void SetUp() {
+ tx_type_ = GET_PARAM(0);
+ tx_size_ = GET_PARAM(1);
+ max_error_ = GET_PARAM(2);
+ max_avg_error_ = GET_PARAM(3);
+ count_ = 500;
+ TXFM_2D_FLIP_CFG fwd_txfm_flip_cfg =
+ av1_get_fwd_txfm_cfg(tx_type_, tx_size_);
+ const TXFM_2D_CFG *fwd_txfm_cfg = fwd_txfm_flip_cfg.cfg;
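+ // The total forward shift (shift[0] + shift[1] + shift[2]) scales the
+ // integer transform output; amplify_factor_ accounts for that scale when
+ // comparing against the double-precision reference.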
+ int amplify_bit = fwd_txfm_cfg->shift[0] + fwd_txfm_cfg->shift[1] +
+ fwd_txfm_cfg->shift[2];
+ ud_flip_ = fwd_txfm_flip_cfg.ud_flip;
+ lr_flip_ = fwd_txfm_flip_cfg.lr_flip;
+ amplify_factor_ =
+ amplify_bit >= 0 ? (1 << amplify_bit) : (1.0 / (1 << -amplify_bit));
+
+ fwd_txfm_ = libaom_test::fwd_txfm_func_ls[tx_size_];
+ txfm1d_size_ = libaom_test::get_txfm1d_size(tx_size_);
+ txfm2d_size_ = txfm1d_size_ * txfm1d_size_;
+ get_txfm1d_type(tx_type_, &type0_, &type1_);
+ input_ = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(input_[0]) * txfm2d_size_));
+ output_ = reinterpret_cast<int32_t *>(
+ aom_memalign(16, sizeof(output_[0]) * txfm2d_size_));
+ ref_input_ = reinterpret_cast<double *>(
+ aom_memalign(16, sizeof(ref_input_[0]) * txfm2d_size_));
+ ref_output_ = reinterpret_cast<double *>(
+ aom_memalign(16, sizeof(ref_output_[0]) * txfm2d_size_));
+ }
+
+ void RunFwdAccuracyCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ double avg_abs_error = 0;
+ for (int ci = 0; ci < count_; ci++) {
+ for (int ni = 0; ni < txfm2d_size_; ++ni) {
+ input_[ni] = rnd.Rand16() % input_base;
+ ref_input_[ni] = static_cast<double>(input_[ni]);
+ output_[ni] = 0;
+ ref_output_[ni] = 0;
+ }
+
+ fwd_txfm_(input_, output_, txfm1d_size_, tx_type_, bd);
+
+ if (lr_flip_ && ud_flip_)
+ libaom_test::fliplrud(ref_input_, txfm1d_size_, txfm1d_size_);
+ else if (lr_flip_)
+ libaom_test::fliplr(ref_input_, txfm1d_size_, txfm1d_size_);
+ else if (ud_flip_)
+ libaom_test::flipud(ref_input_, txfm1d_size_, txfm1d_size_);
+
+ reference_hybrid_2d(ref_input_, ref_output_, txfm1d_size_, type0_,
+ type1_);
+
+ for (int ni = 0; ni < txfm2d_size_; ++ni) {
+ ref_output_[ni] = round(ref_output_[ni] * amplify_factor_);
+ EXPECT_GE(max_error_,
+ fabs(output_[ni] - ref_output_[ni]) / amplify_factor_);
+ }
+ avg_abs_error += compute_avg_abs_error<int32_t, double>(
+ output_, ref_output_, txfm2d_size_);
+ }
+
+ avg_abs_error /= amplify_factor_;
+ avg_abs_error /= count_;
+ // max_avg_error_ is an empirical upper bound on avg_abs_error.
+ // printf("type0: %d type1: %d txfm_size: %d accuracy_avg_abs_error:
+ // %f\n", type0_, type1_, txfm1d_size_, avg_abs_error);
+ EXPECT_GE(max_avg_error_, avg_abs_error);
+ }
+
+ virtual void TearDown() {
+ aom_free(input_);
+ aom_free(output_);
+ aom_free(ref_input_);
+ aom_free(ref_output_);
+ }
+
+ private:
+ double max_error_;
+ double max_avg_error_;
+ int count_;
+ double amplify_factor_;
+ TX_TYPE tx_type_;
+ TX_SIZE tx_size_;
+ int txfm1d_size_;
+ int txfm2d_size_;
+ Fwd_Txfm2d_Func fwd_txfm_;
+ TYPE_TXFM type0_;
+ TYPE_TXFM type1_;
+ int16_t *input_;
+ int32_t *output_;
+ double *ref_input_;
+ double *ref_output_;
+ int ud_flip_; // flip upside down
+ int lr_flip_; // flip left to right
+};
+
+TEST_P(AV1FwdTxfm2d, RunFwdAccuracyCheck) { RunFwdAccuracyCheck(); }
+const AV1FwdTxfm2dParam av1_fwd_txfm2d_param_c[] = {
+#if CONFIG_EXT_TX
+ AV1FwdTxfm2dParam(FLIPADST_DCT, TX_4X4, 2, 0.2),
+ AV1FwdTxfm2dParam(DCT_FLIPADST, TX_4X4, 2, 0.2),
+ AV1FwdTxfm2dParam(FLIPADST_FLIPADST, TX_4X4, 2, 0.2),
+ AV1FwdTxfm2dParam(ADST_FLIPADST, TX_4X4, 2, 0.2),
+ AV1FwdTxfm2dParam(FLIPADST_ADST, TX_4X4, 2, 0.2),
+ AV1FwdTxfm2dParam(FLIPADST_DCT, TX_8X8, 5, 0.6),
+ AV1FwdTxfm2dParam(DCT_FLIPADST, TX_8X8, 5, 0.6),
+ AV1FwdTxfm2dParam(FLIPADST_FLIPADST, TX_8X8, 5, 0.6),
+ AV1FwdTxfm2dParam(ADST_FLIPADST, TX_8X8, 5, 0.6),
+ AV1FwdTxfm2dParam(FLIPADST_ADST, TX_8X8, 5, 0.6),
+ AV1FwdTxfm2dParam(FLIPADST_DCT, TX_16X16, 11, 1.5),
+ AV1FwdTxfm2dParam(DCT_FLIPADST, TX_16X16, 11, 1.5),
+ AV1FwdTxfm2dParam(FLIPADST_FLIPADST, TX_16X16, 11, 1.5),
+ AV1FwdTxfm2dParam(ADST_FLIPADST, TX_16X16, 11, 1.5),
+ AV1FwdTxfm2dParam(FLIPADST_ADST, TX_16X16, 11, 1.5),
+ AV1FwdTxfm2dParam(FLIPADST_DCT, TX_32X32, 70, 7),
+ AV1FwdTxfm2dParam(DCT_FLIPADST, TX_32X32, 70, 7),
+ AV1FwdTxfm2dParam(FLIPADST_FLIPADST, TX_32X32, 70, 7),
+ AV1FwdTxfm2dParam(ADST_FLIPADST, TX_32X32, 70, 7),
+ AV1FwdTxfm2dParam(FLIPADST_ADST, TX_32X32, 70, 7),
+#endif
+ AV1FwdTxfm2dParam(DCT_DCT, TX_4X4, 2, 0.2),
+ AV1FwdTxfm2dParam(ADST_DCT, TX_4X4, 2, 0.2),
+ AV1FwdTxfm2dParam(DCT_ADST, TX_4X4, 2, 0.2),
+ AV1FwdTxfm2dParam(ADST_ADST, TX_4X4, 2, 0.2),
+ AV1FwdTxfm2dParam(DCT_DCT, TX_8X8, 5, 0.6),
+ AV1FwdTxfm2dParam(ADST_DCT, TX_8X8, 5, 0.6),
+ AV1FwdTxfm2dParam(DCT_ADST, TX_8X8, 5, 0.6),
+ AV1FwdTxfm2dParam(ADST_ADST, TX_8X8, 5, 0.6),
+ AV1FwdTxfm2dParam(DCT_DCT, TX_16X16, 11, 1.5),
+ AV1FwdTxfm2dParam(ADST_DCT, TX_16X16, 11, 1.5),
+ AV1FwdTxfm2dParam(DCT_ADST, TX_16X16, 11, 1.5),
+ AV1FwdTxfm2dParam(ADST_ADST, TX_16X16, 11, 1.5),
+ AV1FwdTxfm2dParam(DCT_DCT, TX_32X32, 70, 7),
+ AV1FwdTxfm2dParam(ADST_DCT, TX_32X32, 70, 7),
+ AV1FwdTxfm2dParam(DCT_ADST, TX_32X32, 70, 7),
+ AV1FwdTxfm2dParam(ADST_ADST, TX_32X32, 70, 7)
+};
+
+INSTANTIATE_TEST_CASE_P(C, AV1FwdTxfm2d,
+ ::testing::ValuesIn(av1_fwd_txfm2d_param_c));
+
+#endif // CONFIG_AOM_HIGHBITDEPTH
+} // namespace
diff --git a/test/av1_highbd_iht_test.cc b/test/av1_highbd_iht_test.cc
new file mode 100644
index 0000000..648e744
--- /dev/null
+++ b/test/av1_highbd_iht_test.cc
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./av1_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/enums.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+namespace {
+
+using std::tr1::tuple;
+using libaom_test::ACMRandom;
+
+typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
+ int tx_type, int bd);
+
+typedef void (*IHbdHtFunc)(const int32_t *coeff, uint16_t *output, int stride,
+ int tx_type, int bd);
+
+// Test parameter argument list:
+// <transform reference function,
+// optimized inverse transform function,
+// inverse transform reference function,
+// num_coeffs,
+// tx_type,
+// bit_depth>
+typedef tuple<HbdHtFunc, IHbdHtFunc, IHbdHtFunc, int, int, int> IHbdHtParam;
+
+class AV1HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> {
+ public:
+ virtual ~AV1HighbdInvHTNxN() {}
+
+ virtual void SetUp() {
+ txfm_ref_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ inv_txfm_ref_ = GET_PARAM(2);
+ num_coeffs_ = GET_PARAM(3);
+ tx_type_ = GET_PARAM(4);
+ bit_depth_ = GET_PARAM(5);
+
+ input_ = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(input_[0]) * num_coeffs_));
+
+ // Note:
+ // The inverse transform input buffer is 32-byte aligned.
+ // Refer to <root>/av1/encoder/context_tree.c, function
+ // alloc_mode_context().
+ coeffs_ = reinterpret_cast<int32_t *>(
+ aom_memalign(32, sizeof(coeffs_[0]) * num_coeffs_));
+ output_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(32, sizeof(output_[0]) * num_coeffs_));
+ output_ref_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(32, sizeof(output_ref_[0]) * num_coeffs_));
+ }
+
+ virtual void TearDown() {
+ aom_free(input_);
+ aom_free(coeffs_);
+ aom_free(output_);
+ aom_free(output_ref_);
+ libaom_test::ClearSystemState();
+ }
+
+ protected:
+ void RunBitexactCheck();
+
+ private:
+ int GetStride() const {
+ if (16 == num_coeffs_) {
+ return 4;
+ } else if (64 == num_coeffs_) {
+ return 8;
+ } else if (256 == num_coeffs_) {
+ return 16;
+ } else {
+ return 0;
+ }
+ }
+
+ HbdHtFunc txfm_ref_;
+ IHbdHtFunc inv_txfm_;
+ IHbdHtFunc inv_txfm_ref_;
+ int num_coeffs_;
+ int tx_type_;
+ int bit_depth_;
+
+ int16_t *input_;
+ int32_t *coeffs_;
+ uint16_t *output_;
+ uint16_t *output_ref_;
+};
+
+void AV1HighbdInvHTNxN::RunBitexactCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int stride = GetStride();
+ const int num_tests = 20000;
+ const uint16_t mask = (1 << bit_depth_) - 1;
+
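+ // Round trip: forward-transform random residuals, then require the
+ // optimized inverse transform to match the C reference inverse bit-exactly.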
+ for (int i = 0; i < num_tests; ++i) {
+ for (int j = 0; j < num_coeffs_; ++j) {
+ input_[j] = (rnd.Rand16() & mask) - (rnd.Rand16() & mask);
+ output_ref_[j] = rnd.Rand16() & mask;
+ output_[j] = output_ref_[j];
+ }
+
+ txfm_ref_(input_, coeffs_, stride, tx_type_, bit_depth_);
+ inv_txfm_ref_(coeffs_, output_ref_, stride, tx_type_, bit_depth_);
+ ASM_REGISTER_STATE_CHECK(
+ inv_txfm_(coeffs_, output_, stride, tx_type_, bit_depth_));
+
+ for (int j = 0; j < num_coeffs_; ++j) {
+ EXPECT_EQ(output_ref_[j], output_[j])
+ << "Not bit-exact result at index: " << j << " At test block: " << i;
+ }
+ }
+}
+
+TEST_P(AV1HighbdInvHTNxN, InvTransResultCheck) { RunBitexactCheck(); }
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE4_1 && CONFIG_AOM_HIGHBITDEPTH
+#define PARAM_LIST_4X4 \
+ &av1_fwd_txfm2d_4x4_c, &av1_inv_txfm2d_add_4x4_sse4_1, \
+ &av1_inv_txfm2d_add_4x4_c, 16
+
+#define PARAM_LIST_8X8 \
+ &av1_fwd_txfm2d_8x8_c, &av1_inv_txfm2d_add_8x8_sse4_1, \
+ &av1_inv_txfm2d_add_8x8_c, 64
+
+#define PARAM_LIST_16X16 \
+ &av1_fwd_txfm2d_16x16_c, &av1_inv_txfm2d_add_16x16_sse4_1, \
+ &av1_inv_txfm2d_add_16x16_c, 256
+
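+// For illustration: make_tuple(PARAM_LIST_16X16, DCT_DCT, 10) expands to
+//   make_tuple(&av1_fwd_txfm2d_16x16_c, &av1_inv_txfm2d_add_16x16_sse4_1,
+//              &av1_inv_txfm2d_add_16x16_c, 256, DCT_DCT, 10),
+// i.e. the six IHbdHtParam fields in declaration order.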
+const IHbdHtParam kArrayIhtParam[] = {
+ // 16x16
+ make_tuple(PARAM_LIST_16X16, DCT_DCT, 10),
+ make_tuple(PARAM_LIST_16X16, DCT_DCT, 12),
+ make_tuple(PARAM_LIST_16X16, ADST_DCT, 10),
+ make_tuple(PARAM_LIST_16X16, ADST_DCT, 12),
+ make_tuple(PARAM_LIST_16X16, DCT_ADST, 10),
+ make_tuple(PARAM_LIST_16X16, DCT_ADST, 12),
+ make_tuple(PARAM_LIST_16X16, ADST_ADST, 10),
+ make_tuple(PARAM_LIST_16X16, ADST_ADST, 12),
+#if CONFIG_EXT_TX
+ make_tuple(PARAM_LIST_16X16, FLIPADST_DCT, 10),
+ make_tuple(PARAM_LIST_16X16, FLIPADST_DCT, 12),
+ make_tuple(PARAM_LIST_16X16, DCT_FLIPADST, 10),
+ make_tuple(PARAM_LIST_16X16, DCT_FLIPADST, 12),
+ make_tuple(PARAM_LIST_16X16, FLIPADST_FLIPADST, 10),
+ make_tuple(PARAM_LIST_16X16, FLIPADST_FLIPADST, 12),
+ make_tuple(PARAM_LIST_16X16, ADST_FLIPADST, 10),
+ make_tuple(PARAM_LIST_16X16, ADST_FLIPADST, 12),
+ make_tuple(PARAM_LIST_16X16, FLIPADST_ADST, 10),
+ make_tuple(PARAM_LIST_16X16, FLIPADST_ADST, 12),
+#endif
+ // 8x8
+ make_tuple(PARAM_LIST_8X8, DCT_DCT, 10),
+ make_tuple(PARAM_LIST_8X8, DCT_DCT, 12),
+ make_tuple(PARAM_LIST_8X8, ADST_DCT, 10),
+ make_tuple(PARAM_LIST_8X8, ADST_DCT, 12),
+ make_tuple(PARAM_LIST_8X8, DCT_ADST, 10),
+ make_tuple(PARAM_LIST_8X8, DCT_ADST, 12),
+ make_tuple(PARAM_LIST_8X8, ADST_ADST, 10),
+ make_tuple(PARAM_LIST_8X8, ADST_ADST, 12),
+#if CONFIG_EXT_TX
+ make_tuple(PARAM_LIST_8X8, FLIPADST_DCT, 10),
+ make_tuple(PARAM_LIST_8X8, FLIPADST_DCT, 12),
+ make_tuple(PARAM_LIST_8X8, DCT_FLIPADST, 10),
+ make_tuple(PARAM_LIST_8X8, DCT_FLIPADST, 12),
+ make_tuple(PARAM_LIST_8X8, FLIPADST_FLIPADST, 10),
+ make_tuple(PARAM_LIST_8X8, FLIPADST_FLIPADST, 12),
+ make_tuple(PARAM_LIST_8X8, ADST_FLIPADST, 10),
+ make_tuple(PARAM_LIST_8X8, ADST_FLIPADST, 12),
+ make_tuple(PARAM_LIST_8X8, FLIPADST_ADST, 10),
+ make_tuple(PARAM_LIST_8X8, FLIPADST_ADST, 12),
+#endif
+ // 4x4
+ make_tuple(PARAM_LIST_4X4, DCT_DCT, 10),
+ make_tuple(PARAM_LIST_4X4, DCT_DCT, 12),
+ make_tuple(PARAM_LIST_4X4, ADST_DCT, 10),
+ make_tuple(PARAM_LIST_4X4, ADST_DCT, 12),
+ make_tuple(PARAM_LIST_4X4, DCT_ADST, 10),
+ make_tuple(PARAM_LIST_4X4, DCT_ADST, 12),
+ make_tuple(PARAM_LIST_4X4, ADST_ADST, 10),
+ make_tuple(PARAM_LIST_4X4, ADST_ADST, 12),
+#if CONFIG_EXT_TX
+ make_tuple(PARAM_LIST_4X4, FLIPADST_DCT, 10),
+ make_tuple(PARAM_LIST_4X4, FLIPADST_DCT, 12),
+ make_tuple(PARAM_LIST_4X4, DCT_FLIPADST, 10),
+ make_tuple(PARAM_LIST_4X4, DCT_FLIPADST, 12),
+ make_tuple(PARAM_LIST_4X4, FLIPADST_FLIPADST, 10),
+ make_tuple(PARAM_LIST_4X4, FLIPADST_FLIPADST, 12),
+ make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 10),
+ make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 12),
+ make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 10),
+ make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 12),
+#endif
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdInvHTNxN,
+ ::testing::ValuesIn(kArrayIhtParam));
+#endif // HAVE_SSE4_1 && CONFIG_AOM_HIGHBITDEPTH
+
+} // namespace
diff --git a/test/av1_inv_txfm1d_test.cc b/test/av1_inv_txfm1d_test.cc
new file mode 100644
index 0000000..8470fc0
--- /dev/null
+++ b/test/av1_inv_txfm1d_test.cc
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "test/av1_txfm_test.h"
+#include "av1/common/av1_fwd_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::input_base;
+
+namespace {
+const int txfm_type_num = 2;
+const int txfm_size_num = 5;
+const int txfm_size_ls[5] = { 4, 8, 16, 32, 64 };
+
+const TxfmFunc fwd_txfm_func_ls[2][5] = {
+#if CONFIG_TX64X64
+ { av1_fdct4_new, av1_fdct8_new, av1_fdct16_new, av1_fdct32_new,
+ av1_fdct64_new },
+#else
+ { av1_fdct4_new, av1_fdct8_new, av1_fdct16_new, av1_fdct32_new, NULL },
+#endif
+ { av1_fadst4_new, av1_fadst8_new, av1_fadst16_new, av1_fadst32_new, NULL }
+};
+
+const TxfmFunc inv_txfm_func_ls[2][5] = {
+#if CONFIG_TX64X64
+ { av1_idct4_new, av1_idct8_new, av1_idct16_new, av1_idct32_new,
+ av1_idct64_new },
+#else
+ { av1_idct4_new, av1_idct8_new, av1_idct16_new, av1_idct32_new, NULL },
+#endif
+ { av1_iadst4_new, av1_iadst8_new, av1_iadst16_new, av1_iadst32_new, NULL }
+};
+
+// The maximum number of stages for the fwd/inv 1D DCT/ADST transforms is 12.
+const int8_t cos_bit[12] = { 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14 };
+const int8_t range_bit[12] = { 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 };
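+// (Editor assumption about the 1D transform API: cos_bit[i] is presumably the
+// bit precision of the cosine constants used at stage i, and range_bit[i] the
+// permitted bit range of intermediate values at that stage; 32 is wide enough
+// that the range is never a constraint in these round-trip tests.)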
+
+TEST(av1_inv_txfm1d, round_trip) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ for (int si = 0; si < txfm_size_num; ++si) {
+ int txfm_size = txfm_size_ls[si];
+ int32_t *input = new int32_t[txfm_size];
+ int32_t *output = new int32_t[txfm_size];
+ int32_t *round_trip_output = new int32_t[txfm_size];
+
+ for (int ti = 0; ti < txfm_type_num; ++ti) {
+ TxfmFunc fwd_txfm_func = fwd_txfm_func_ls[ti][si];
+ TxfmFunc inv_txfm_func = inv_txfm_func_ls[ti][si];
+ int max_error = 2;
+
+ if (fwd_txfm_func != NULL) {
+ const int count_test_block = 5000;
+ for (int ci = 0; ci < count_test_block; ++ci) {
+ for (int ni = 0; ni < txfm_size; ++ni) {
+ input[ni] = rnd.Rand16() % input_base - rnd.Rand16() % input_base;
+ }
+
+ fwd_txfm_func(input, output, cos_bit, range_bit);
+ inv_txfm_func(output, round_trip_output, cos_bit, range_bit);
+
+ for (int ni = 0; ni < txfm_size; ++ni) {
+ int node_err =
+ abs(input[ni] - round_shift(round_trip_output[ni],
+ get_max_bit(txfm_size) - 1));
+ EXPECT_LE(node_err, max_error);
+ }
+ }
+ }
+ }
+ delete[] input;
+ delete[] output;
+ delete[] round_trip_output;
+ }
+}
+
+} // namespace
diff --git a/test/av1_inv_txfm2d_test.cc b/test/av1_inv_txfm2d_test.cc
new file mode 100644
index 0000000..55a745f
--- /dev/null
+++ b/test/av1_inv_txfm2d_test.cc
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "./av1_rtcd.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/av1_txfm_test.h"
+#include "av1/common/av1_inv_txfm2d_cfg.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::input_base;
+using libaom_test::bd;
+using libaom_test::compute_avg_abs_error;
+using libaom_test::Fwd_Txfm2d_Func;
+using libaom_test::Inv_Txfm2d_Func;
+
+namespace {
+
+#if CONFIG_AOM_HIGHBITDEPTH
+// AV1InvTxfm2dParam argument list:
+// tx_type_, tx_size_, max_error_, max_avg_error_
+typedef std::tr1::tuple<TX_TYPE, TX_SIZE, int, double> AV1InvTxfm2dParam;
+
+class AV1InvTxfm2d : public ::testing::TestWithParam<AV1InvTxfm2dParam> {
+ public:
+ virtual void SetUp() {
+ tx_type_ = GET_PARAM(0);
+ tx_size_ = GET_PARAM(1);
+ max_error_ = GET_PARAM(2);
+ max_avg_error_ = GET_PARAM(3);
+ txfm1d_size_ = libaom_test::get_txfm1d_size(tx_size_);
+ txfm2d_size_ = txfm1d_size_ * txfm1d_size_;
+ count_ = 500;
+
+ input_ = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * txfm2d_size_));
+ ref_input_ = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * txfm2d_size_));
+ output_ = reinterpret_cast<int32_t *>(
+ aom_memalign(16, sizeof(int32_t) * txfm2d_size_));
+ }
+
+ void RunRoundtripCheck() {
+ const Fwd_Txfm2d_Func fwd_txfm_func =
+ libaom_test::fwd_txfm_func_ls[tx_size_];
+ const Inv_Txfm2d_Func inv_txfm_func =
+ libaom_test::inv_txfm_func_ls[tx_size_];
+ double avg_abs_error = 0;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ for (int ci = 0; ci < count_; ci++) {
+ for (int ni = 0; ni < txfm2d_size_; ++ni) {
+ if (ci == 0) {
+ int extreme_input = input_base - 1;
+ input_[ni] = extreme_input; // extreme case
+ ref_input_[ni] = 0;
+ } else {
+ input_[ni] = rnd.Rand16() % input_base;
+ ref_input_[ni] = 0;
+ }
+ }
+
+ fwd_txfm_func(input_, output_, txfm1d_size_, tx_type_, bd);
+ inv_txfm_func(output_, ref_input_, txfm1d_size_, tx_type_, bd);
+
+ for (int ni = 0; ni < txfm2d_size_; ++ni) {
+ EXPECT_GE(max_error_, abs(input_[ni] - ref_input_[ni]));
+ }
+ avg_abs_error += compute_avg_abs_error<int16_t, uint16_t>(
+ input_, ref_input_, txfm2d_size_);
+ }
+
+ avg_abs_error /= count_;
+    // max_avg_error_ is an empirical upper bound on the average absolute
+    // error, obtained from the output of:
+    //   printf("txfm1d_size: %d accuracy_avg_abs_error: %f\n",
+    //          txfm1d_size_, avg_abs_error);
+ EXPECT_GE(max_avg_error_, avg_abs_error);
+ }
+
+ virtual void TearDown() {
+ aom_free(input_);
+ aom_free(output_);
+ aom_free(ref_input_);
+ }
+
+ private:
+ int count_;
+ int max_error_;
+ double max_avg_error_;
+ TX_TYPE tx_type_;
+ TX_SIZE tx_size_;
+ int txfm1d_size_;
+ int txfm2d_size_;
+ int16_t *input_;
+ uint16_t *ref_input_;
+ int32_t *output_;
+};
+
+TEST_P(AV1InvTxfm2d, RunRoundtripCheck) { RunRoundtripCheck(); }
+
+const AV1InvTxfm2dParam av1_inv_txfm2d_param[] = {
+#if CONFIG_EXT_TX
+ AV1InvTxfm2dParam(FLIPADST_DCT, TX_4X4, 2, 0.002),
+ AV1InvTxfm2dParam(DCT_FLIPADST, TX_4X4, 2, 0.002),
+ AV1InvTxfm2dParam(FLIPADST_FLIPADST, TX_4X4, 2, 0.002),
+ AV1InvTxfm2dParam(ADST_FLIPADST, TX_4X4, 2, 0.002),
+ AV1InvTxfm2dParam(FLIPADST_ADST, TX_4X4, 2, 0.002),
+ AV1InvTxfm2dParam(FLIPADST_DCT, TX_8X8, 2, 0.02),
+ AV1InvTxfm2dParam(DCT_FLIPADST, TX_8X8, 2, 0.02),
+ AV1InvTxfm2dParam(FLIPADST_FLIPADST, TX_8X8, 2, 0.02),
+ AV1InvTxfm2dParam(ADST_FLIPADST, TX_8X8, 2, 0.02),
+ AV1InvTxfm2dParam(FLIPADST_ADST, TX_8X8, 2, 0.02),
+ AV1InvTxfm2dParam(FLIPADST_DCT, TX_16X16, 2, 0.04),
+ AV1InvTxfm2dParam(DCT_FLIPADST, TX_16X16, 2, 0.04),
+ AV1InvTxfm2dParam(FLIPADST_FLIPADST, TX_16X16, 11, 0.04),
+ AV1InvTxfm2dParam(ADST_FLIPADST, TX_16X16, 2, 0.04),
+ AV1InvTxfm2dParam(FLIPADST_ADST, TX_16X16, 2, 0.04),
+ AV1InvTxfm2dParam(FLIPADST_DCT, TX_32X32, 4, 0.4),
+ AV1InvTxfm2dParam(DCT_FLIPADST, TX_32X32, 4, 0.4),
+ AV1InvTxfm2dParam(FLIPADST_FLIPADST, TX_32X32, 4, 0.4),
+ AV1InvTxfm2dParam(ADST_FLIPADST, TX_32X32, 4, 0.4),
+ AV1InvTxfm2dParam(FLIPADST_ADST, TX_32X32, 4, 0.4),
+#endif
+ AV1InvTxfm2dParam(DCT_DCT, TX_4X4, 2, 0.002),
+ AV1InvTxfm2dParam(ADST_DCT, TX_4X4, 2, 0.002),
+ AV1InvTxfm2dParam(DCT_ADST, TX_4X4, 2, 0.002),
+ AV1InvTxfm2dParam(ADST_ADST, TX_4X4, 2, 0.002),
+ AV1InvTxfm2dParam(DCT_DCT, TX_8X8, 2, 0.02),
+ AV1InvTxfm2dParam(ADST_DCT, TX_8X8, 2, 0.02),
+ AV1InvTxfm2dParam(DCT_ADST, TX_8X8, 2, 0.02),
+ AV1InvTxfm2dParam(ADST_ADST, TX_8X8, 2, 0.02),
+ AV1InvTxfm2dParam(DCT_DCT, TX_16X16, 2, 0.04),
+ AV1InvTxfm2dParam(ADST_DCT, TX_16X16, 2, 0.04),
+ AV1InvTxfm2dParam(DCT_ADST, TX_16X16, 2, 0.04),
+ AV1InvTxfm2dParam(ADST_ADST, TX_16X16, 2, 0.04),
+ AV1InvTxfm2dParam(DCT_DCT, TX_32X32, 4, 0.4),
+ AV1InvTxfm2dParam(ADST_DCT, TX_32X32, 4, 0.4),
+ AV1InvTxfm2dParam(DCT_ADST, TX_32X32, 4, 0.4),
+ AV1InvTxfm2dParam(ADST_ADST, TX_32X32, 4, 0.4)
+};
+
+INSTANTIATE_TEST_CASE_P(C, AV1InvTxfm2d,
+ ::testing::ValuesIn(av1_inv_txfm2d_param));
+
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+} // namespace
diff --git a/test/av1_inv_txfm_test.cc b/test/av1_inv_txfm_test.cc
index 1d7335f..8f6c868 100644
--- a/test/av1_inv_txfm_test.cc
+++ b/test/av1_inv_txfm_test.cc
@@ -24,7 +24,7 @@
#include "av1/common/blockd.h"
#include "av1/common/scan.h"
#include "aom/aom_integer.h"
-#include "av1/common/av1_inv_txfm.h"
+#include "aom_dsp/inv_txfm.h"
using libaom_test::ACMRandom;
@@ -104,10 +104,10 @@
INSTANTIATE_TEST_CASE_P(
C, AV1InvTxfm,
- ::testing::Values(IdctParam(&av1_idct4_c, &reference_idct_1d, 4, 1),
- IdctParam(&av1_idct8_c, &reference_idct_1d, 8, 2),
- IdctParam(&av1_idct16_c, &reference_idct_1d, 16, 4),
- IdctParam(&av1_idct32_c, &reference_idct_1d, 32, 6)));
+ ::testing::Values(IdctParam(&aom_idct4_c, &reference_idct_1d, 4, 1),
+ IdctParam(&aom_idct8_c, &reference_idct_1d, 8, 2),
+ IdctParam(&aom_idct16_c, &reference_idct_1d, 16, 4),
+ IdctParam(&aom_idct32_c, &reference_idct_1d, 32, 6)));
#if CONFIG_AV1_ENCODER
typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
@@ -137,6 +137,7 @@
InvTxfmFunc partial_itxfm_;
};
+#if !CONFIG_ADAPT_SCAN
TEST_P(AV1PartialIDctTest, RunQuantCheck) {
int size;
switch (tx_size_) {
@@ -184,8 +185,9 @@
// quantization with maximum allowed step sizes
test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
for (int j = 1; j < last_nonzero_; ++j)
- test_coef_block1[av1_default_scan_orders[tx_size_].scan[j]] =
- (output_ref_block[j] / 1828) * 1828;
+ test_coef_block1[get_scan((const AV1_COMMON *)NULL, tx_size_, DCT_DCT,
+ 0)
+ ->scan[j]] = (output_ref_block[j] / 1828) * 1828;
}
ASM_REGISTER_STATE_CHECK(full_itxfm_(test_coef_block1, dst1, size));
@@ -235,7 +237,8 @@
max_energy_leftover = 0;
coef = 0;
}
- test_coef_block1[av1_default_scan_orders[tx_size_].scan[j]] = coef;
+ test_coef_block1[get_scan((const AV1_COMMON *)NULL, tx_size_, DCT_DCT, 0)
+ ->scan[j]] = coef;
}
memcpy(test_coef_block2, test_coef_block1,
@@ -254,23 +257,24 @@
EXPECT_EQ(0, max_error)
<< "Error: partial inverse transform produces different results";
}
+#endif
using std::tr1::make_tuple;
INSTANTIATE_TEST_CASE_P(
C, AV1PartialIDctTest,
- ::testing::Values(make_tuple(&aom_fdct32x32_c, &av1_idct32x32_1024_add_c,
- &av1_idct32x32_34_add_c, TX_32X32, 34),
- make_tuple(&aom_fdct32x32_c, &av1_idct32x32_1024_add_c,
- &av1_idct32x32_1_add_c, TX_32X32, 1),
- make_tuple(&aom_fdct16x16_c, &av1_idct16x16_256_add_c,
- &av1_idct16x16_10_add_c, TX_16X16, 10),
- make_tuple(&aom_fdct16x16_c, &av1_idct16x16_256_add_c,
- &av1_idct16x16_1_add_c, TX_16X16, 1),
- make_tuple(&aom_fdct8x8_c, &av1_idct8x8_64_add_c,
- &av1_idct8x8_12_add_c, TX_8X8, 12),
- make_tuple(&aom_fdct8x8_c, &av1_idct8x8_64_add_c,
- &av1_idct8x8_1_add_c, TX_8X8, 1),
- make_tuple(&aom_fdct4x4_c, &av1_idct4x4_16_add_c,
- &av1_idct4x4_1_add_c, TX_4X4, 1)));
+ ::testing::Values(make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c,
+ &aom_idct32x32_34_add_c, TX_32X32, 34),
+ make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c,
+ &aom_idct32x32_1_add_c, TX_32X32, 1),
+ make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
+ &aom_idct16x16_10_add_c, TX_16X16, 10),
+ make_tuple(&aom_fdct16x16_c, &aom_idct16x16_256_add_c,
+ &aom_idct16x16_1_add_c, TX_16X16, 1),
+ make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
+ &aom_idct8x8_12_add_c, TX_8X8, 12),
+ make_tuple(&aom_fdct8x8_c, &aom_idct8x8_64_add_c,
+ &aom_idct8x8_1_add_c, TX_8X8, 1),
+ make_tuple(&aom_fdct4x4_c, &aom_idct4x4_16_add_c,
+ &aom_idct4x4_1_add_c, TX_4X4, 1)));
#endif // CONFIG_AV1_ENCODER
} // namespace
diff --git a/test/av1_quantize_test.cc b/test/av1_quantize_test.cc
new file mode 100644
index 0000000..6320090
--- /dev/null
+++ b/test/av1_quantize_test.cc
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "av1/common/scan.h"
+
+namespace {
+
+typedef void (*QuantizeFpFunc)(
+ const tran_low_t *coeff_ptr, intptr_t count, int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const int log_scale);
+
+struct QuantizeFuncParams {
+ QuantizeFuncParams(QuantizeFpFunc qF = NULL, QuantizeFpFunc qRefF = NULL,
+ int count = 16)
+ : qFunc(qF), qFuncRef(qRefF), coeffCount(count) {}
+ QuantizeFpFunc qFunc;
+ QuantizeFpFunc qFuncRef;
+ int coeffCount;
+};
+
+using libaom_test::ACMRandom;
+
+const int numTests = 1000;
+const int maxSize = 1024;
+const int roundFactorRange = 127;
+const int dequantRange = 32768;
+const int coeffRange = (1 << 20) - 1;
+
+class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeFuncParams> {
+ public:
+ void RunQuantizeTest() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
+ DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
+ DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
+ DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+ DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+ uint16_t eob;
+ uint16_t ref_eob;
+ int err_count_total = 0;
+ int first_failure = -1;
+ int skip_block = 0;
+ int count = params_.coeffCount;
+ const TX_SIZE txSize = getTxSize(count);
+ int log_scale = (txSize == TX_32X32);
+ QuantizeFpFunc quanFunc = params_.qFunc;
+ QuantizeFpFunc quanFuncRef = params_.qFuncRef;
+
+ const SCAN_ORDER scanOrder = av1_default_scan_orders[txSize];
+ for (int i = 0; i < numTests; i++) {
+ int err_count = 0;
+ ref_eob = eob = -1;
+ for (int j = 0; j < count; j++) {
+ coeff_ptr[j] = rnd(coeffRange);
+ }
+
+ for (int j = 0; j < 2; j++) {
+ zbin_ptr[j] = rnd.Rand16();
+ quant_shift_ptr[j] = rnd.Rand16();
+        // dequant_ptr must be a positive value that fits in int16_t.
+ dequant_ptr[j] = abs(rnd(dequantRange));
+ quant_ptr[j] = (1 << 16) / dequant_ptr[j];
+ round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
+ }
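+      // (Editor sketch, values are illustrative only): quant is the 16-bit
+      // reciprocal of dequant, so quantize-then-dequantize roughly restores
+      // the input. E.g. dequant = 64 gives quant = 65536 / 64 = 1024; a
+      // coefficient of 1000 quantizes to about (1000 * 1024) >> 16 = 15 and
+      // dequantizes to 15 * 64 = 960. Exact rounding depends on the fp
+      // quantizer implementation.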
+
+ quanFuncRef(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
+ &ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
+
+ ASM_REGISTER_STATE_CHECK(
+ quanFunc(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, &eob,
+ scanOrder.scan, scanOrder.iscan, log_scale));
+
+ for (int j = 0; j < count; ++j) {
+ err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
+ (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
+ EXPECT_EQ(ref_qcoeff_ptr[j], qcoeff_ptr[j]) << "qcoeff error: i = " << i
+ << " j = " << j << "\n";
+ EXPECT_EQ(ref_dqcoeff_ptr[j], dqcoeff_ptr[j])
+ << "dqcoeff error: i = " << i << " j = " << j << "\n";
+ }
+ EXPECT_EQ(ref_eob, eob) << "eob error: "
+ << "i = " << i << "\n";
+ err_count += (ref_eob != eob);
+ if (err_count && !err_count_total) {
+ first_failure = i;
+ }
+ err_count_total += err_count;
+ }
+ EXPECT_EQ(0, err_count_total)
+        << "Error: Quantization Test, C output doesn't match optimized output. "
+ << "First failed at test case " << first_failure;
+ }
+
+ void RunEobTest() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
+ DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
+ DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
+ DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
+ DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[maxSize]);
+ DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
+ uint16_t eob;
+ uint16_t ref_eob;
+ int skip_block = 0;
+ int count = params_.coeffCount;
+ const TX_SIZE txSize = getTxSize(count);
+ int log_scale = (txSize == TX_32X32);
+ QuantizeFpFunc quanFunc = params_.qFunc;
+ QuantizeFpFunc quanFuncRef = params_.qFuncRef;
+ const SCAN_ORDER scanOrder = av1_default_scan_orders[txSize];
+
+ for (int i = 0; i < numTests; i++) {
+ ref_eob = eob = -1;
+ for (int j = 0; j < count; j++) {
+ coeff_ptr[j] = 0;
+ }
+
+ coeff_ptr[rnd(count)] = rnd(coeffRange);
+ coeff_ptr[rnd(count)] = rnd(coeffRange);
+ coeff_ptr[rnd(count)] = rnd(coeffRange);
+
+ for (int j = 0; j < 2; j++) {
+ zbin_ptr[j] = rnd.Rand16();
+ quant_shift_ptr[j] = rnd.Rand16();
+        // dequant_ptr must be a positive value that fits in int16_t.
+ dequant_ptr[j] = abs(rnd(dequantRange));
+ quant_ptr[j] = (1 << 16) / dequant_ptr[j];
+ round_ptr[j] = (abs(rnd(roundFactorRange)) * dequant_ptr[j]) >> 7;
+ }
+
+ quanFuncRef(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, ref_qcoeff_ptr, ref_dqcoeff_ptr, dequant_ptr,
+ &ref_eob, scanOrder.scan, scanOrder.iscan, log_scale);
+
+ ASM_REGISTER_STATE_CHECK(
+ quanFunc(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, &eob,
+ scanOrder.scan, scanOrder.iscan, log_scale));
+ EXPECT_EQ(ref_eob, eob) << "eob error: "
+ << "i = " << i << "\n";
+ }
+ }
+
+ virtual void SetUp() { params_ = GetParam(); }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ virtual ~AV1QuantizeTest() {}
+
+ private:
+ TX_SIZE getTxSize(int count) {
+ switch (count) {
+ case 16: return TX_4X4;
+ case 64: return TX_8X8;
+ case 256: return TX_16X16;
+ case 1024: return TX_32X32;
+ default: return TX_4X4;
+ }
+ }
+
+ QuantizeFuncParams params_;
+};
+
+TEST_P(AV1QuantizeTest, BitExactCheck) { RunQuantizeTest(); }
+TEST_P(AV1QuantizeTest, EobVerify) { RunEobTest(); }
+
+#if HAVE_SSE4_1
+#if !CONFIG_AOM_QM
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, AV1QuantizeTest,
+ ::testing::Values(QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1,
+ &av1_highbd_quantize_fp_c, 16),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1,
+ &av1_highbd_quantize_fp_c, 64),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1,
+ &av1_highbd_quantize_fp_c, 256),
+ QuantizeFuncParams(&av1_highbd_quantize_fp_sse4_1,
+ &av1_highbd_quantize_fp_c, 1024)));
+#endif // !CONFIG_AOM_QM
+#endif // HAVE_SSE4_1
+} // namespace
diff --git a/test/av1_txfm_test.cc b/test/av1_txfm_test.cc
new file mode 100644
index 0000000..8dc6321
--- /dev/null
+++ b/test/av1_txfm_test.cc
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include "test/av1_txfm_test.h"
+
+namespace libaom_test {
+
+int get_txfm1d_size(TX_SIZE tx_size) { return 1 << (tx_size + 2); }
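+// e.g. TX_4X4 -> 4, TX_8X8 -> 8, TX_16X16 -> 16, TX_32X32 -> 32.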
+
+void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM *type0, TYPE_TXFM *type1) {
+ switch (txfm2d_type) {
+ case DCT_DCT:
+ *type0 = TYPE_DCT;
+ *type1 = TYPE_DCT;
+ break;
+ case ADST_DCT:
+ *type0 = TYPE_ADST;
+ *type1 = TYPE_DCT;
+ break;
+ case DCT_ADST:
+ *type0 = TYPE_DCT;
+ *type1 = TYPE_ADST;
+ break;
+ case ADST_ADST:
+ *type0 = TYPE_ADST;
+ *type1 = TYPE_ADST;
+ break;
+#if CONFIG_EXT_TX
+ case FLIPADST_DCT:
+ *type0 = TYPE_ADST;
+ *type1 = TYPE_DCT;
+ break;
+ case DCT_FLIPADST:
+ *type0 = TYPE_DCT;
+ *type1 = TYPE_ADST;
+ break;
+ case FLIPADST_FLIPADST:
+ *type0 = TYPE_ADST;
+ *type1 = TYPE_ADST;
+ break;
+ case ADST_FLIPADST:
+ *type0 = TYPE_ADST;
+ *type1 = TYPE_ADST;
+ break;
+ case FLIPADST_ADST:
+ *type0 = TYPE_ADST;
+ *type1 = TYPE_ADST;
+ break;
+#endif // CONFIG_EXT_TX
+ default:
+ *type0 = TYPE_DCT;
+ *type1 = TYPE_DCT;
+ assert(0);
+ break;
+ }
+}
+
+const double invSqrt2 = 1 / sqrt(2);
+
+void reference_dct_1d(const double *in, double *out, int size) {
+ for (int k = 0; k < size; ++k) {
+ out[k] = 0;
+ for (int n = 0; n < size; ++n) {
+ out[k] += in[n] * cos(M_PI * (2 * n + 1) * k / (2 * size));
+ }
+ if (k == 0) out[k] = out[k] * invSqrt2;
+ }
+}
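+// Editor note: the loop above implements the (un-normalized) DCT-II,
+//   out[k] = c(k) * sum_{n=0}^{N-1} in[n] * cos(pi * (2n + 1) * k / (2N)),
+// with c(0) = 1/sqrt(2) and c(k) = 1 otherwise.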
+
+void reference_adst_1d(const double *in, double *out, int size) {
+ for (int k = 0; k < size; ++k) {
+ out[k] = 0;
+ for (int n = 0; n < size; ++n) {
+ out[k] += in[n] * sin(M_PI * (2 * n + 1) * (2 * k + 1) / (4 * size));
+ }
+ }
+}
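+// Editor note: the loop above implements the DST-IV-style basis
+//   out[k] = sum_{n=0}^{N-1} in[n] * sin(pi * (2n + 1) * (2k + 1) / (4N)),
+// which serves as the ADST reference here.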
+
+void reference_hybrid_1d(double *in, double *out, int size, int type) {
+ if (type == TYPE_DCT)
+ reference_dct_1d(in, out, size);
+ else
+ reference_adst_1d(in, out, size);
+}
+
+void reference_hybrid_2d(double *in, double *out, int size, int type0,
+ int type1) {
+ double *tempOut = new double[size * size];
+
+ for (int r = 0; r < size; r++) {
+    // transpose: in -> tempOut
+ for (int c = 0; c < size; c++) {
+ tempOut[r * size + c] = in[c * size + r];
+ }
+ }
+
+  // 1D transform (type0) each row: tempOut -> out
+ for (int r = 0; r < size; r++) {
+ reference_hybrid_1d(tempOut + r * size, out + r * size, size, type0);
+ }
+
+ for (int r = 0; r < size; r++) {
+    // transpose: out -> tempOut
+ for (int c = 0; c < size; c++) {
+ tempOut[r * size + c] = out[c * size + r];
+ }
+ }
+
+ for (int r = 0; r < size; r++) {
+ reference_hybrid_1d(tempOut + r * size, out + r * size, size, type1);
+ }
+ delete[] tempOut;
+}
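+// Usage sketch (editor note, hypothetical buffers): a 4x4 DCT_DCT reference
+// transform followed by an average-absolute-error measurement against some
+// other 16-element buffer:
+//   double in[16], out[16], other[16];
+//   // ... fill in[] and other[] ...
+//   reference_hybrid_2d(in, out, 4, TYPE_DCT, TYPE_DCT);
+//   const double err = compute_avg_abs_error<double, double>(out, other, 16);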
+
+template <typename Type>
+void fliplr(Type *dest, int stride, int length) {
+ int i, j;
+ for (i = 0; i < length; ++i) {
+ for (j = 0; j < length / 2; ++j) {
+ const Type tmp = dest[i * stride + j];
+ dest[i * stride + j] = dest[i * stride + length - 1 - j];
+ dest[i * stride + length - 1 - j] = tmp;
+ }
+ }
+}
+
+template <typename Type>
+void flipud(Type *dest, int stride, int length) {
+ int i, j;
+ for (j = 0; j < length; ++j) {
+ for (i = 0; i < length / 2; ++i) {
+ const Type tmp = dest[i * stride + j];
+ dest[i * stride + j] = dest[(length - 1 - i) * stride + j];
+ dest[(length - 1 - i) * stride + j] = tmp;
+ }
+ }
+}
+
+template <typename Type>
+void fliplrud(Type *dest, int stride, int length) {
+ int i, j;
+ for (i = 0; i < length / 2; ++i) {
+ for (j = 0; j < length; ++j) {
+ const Type tmp = dest[i * stride + j];
+ dest[i * stride + j] = dest[(length - 1 - i) * stride + length - 1 - j];
+ dest[(length - 1 - i) * stride + length - 1 - j] = tmp;
+ }
+ }
+}
+
+template void fliplr<double>(double *dest, int stride, int length);
+template void flipud<double>(double *dest, int stride, int length);
+template void fliplrud<double>(double *dest, int stride, int length);
+
+} // namespace libaom_test
diff --git a/test/av1_txfm_test.h b/test/av1_txfm_test.h
new file mode 100644
index 0000000..8f0022d
--- /dev/null
+++ b/test/av1_txfm_test.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AV1_TXFM_TEST_H_
+#define AV1_TXFM_TEST_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef _MSC_VER
+#define _USE_MATH_DEFINES
+#endif
+#include <math.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "./av1_rtcd.h"
+
+namespace libaom_test {
+typedef enum {
+ TYPE_DCT = 0,
+ TYPE_ADST,
+ TYPE_IDCT,
+ TYPE_IADST,
+ TYPE_LAST
+} TYPE_TXFM;
+
+int get_txfm1d_size(TX_SIZE tx_size);
+
+void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM *type0, TYPE_TXFM *type1);
+
+void reference_dct_1d(const double *in, double *out, int size);
+
+void reference_adst_1d(const double *in, double *out, int size);
+
+void reference_hybrid_1d(double *in, double *out, int size, int type);
+
+void reference_hybrid_2d(double *in, double *out, int size, int type0,
+ int type1);
+template <typename Type1, typename Type2>
+static double compute_avg_abs_error(const Type1 *a, const Type2 *b,
+ const int size) {
+ double error = 0;
+ for (int i = 0; i < size; i++) {
+ error += fabs(static_cast<double>(a[i]) - static_cast<double>(b[i]));
+ }
+ error = error / size;
+ return error;
+}
+
+template <typename Type>
+void fliplr(Type *dest, int stride, int length);
+
+template <typename Type>
+void flipud(Type *dest, int stride, int length);
+
+template <typename Type>
+void fliplrud(Type *dest, int stride, int length);
+
+typedef void (*TxfmFunc)(const int32_t *in, int32_t *out, const int8_t *cos_bit,
+ const int8_t *range_bit);
+
+typedef void (*Fwd_Txfm2d_Func)(const int16_t *, int32_t *, int, int, int);
+typedef void (*Inv_Txfm2d_Func)(const int32_t *, uint16_t *, int, int, int);
+
+static const int bd = 10;
+static const int input_base = (1 << bd);
+
+#if CONFIG_AOM_HIGHBITDEPTH
+static const Fwd_Txfm2d_Func fwd_txfm_func_ls[TX_SIZES] = {
+ av1_fwd_txfm2d_4x4_c, av1_fwd_txfm2d_8x8_c, av1_fwd_txfm2d_16x16_c,
+ av1_fwd_txfm2d_32x32_c
+};
+
+static const Inv_Txfm2d_Func inv_txfm_func_ls[TX_SIZES] = {
+ av1_inv_txfm2d_add_4x4_c, av1_inv_txfm2d_add_8x8_c,
+ av1_inv_txfm2d_add_16x16_c, av1_inv_txfm2d_add_32x32_c
+};
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+} // namespace libaom_test
+#endif // AV1_TXFM_TEST_H_
diff --git a/test/av1_wedge_utils_test.cc b/test/av1_wedge_utils_test.cc
new file mode 100644
index 0000000..539e9ef
--- /dev/null
+++ b/test/av1_wedge_utils_test.cc
@@ -0,0 +1,382 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./aom_config.h"
+
+#include "./aom_dsp_rtcd.h"
+#include "./av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "av1/common/enums.h"
+
+#include "test/acm_random.h"
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#define WEDGE_WEIGHT_BITS 6
+#define MAX_MASK_VALUE (1 << (WEDGE_WEIGHT_BITS))
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+
+static const int16_t kInt13Max = (1 << 12) - 1;
+
+//////////////////////////////////////////////////////////////////////////////
+// av1_wedge_sse_from_residuals - functionality
+//////////////////////////////////////////////////////////////////////////////
+
+class WedgeUtilsSSEFuncTest : public testing::Test {
+ protected:
+ WedgeUtilsSSEFuncTest() : rng_(ACMRandom::DeterministicSeed()) {}
+
+ static const int kIterations = 1000;
+
+ ACMRandom rng_;
+};
+
+static void equiv_blend_residuals(int16_t *r, const int16_t *r0,
+ const int16_t *r1, const uint8_t *m, int N) {
+ for (int i = 0; i < N; i++) {
+ const int32_t m0 = m[i];
+ const int32_t m1 = MAX_MASK_VALUE - m0;
+ const int16_t R = m0 * r0[i] + m1 * r1[i];
+ // Note that this rounding is designed to match the result
+ // you would get when actually blending the 2 predictors and computing
+ // the residuals.
+ r[i] = ROUND_POWER_OF_TWO(R - 1, WEDGE_WEIGHT_BITS);
+ }
+}
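+// Editor note (derivation sketch): with b = WEDGE_WEIGHT_BITS and
+// m0 + m1 == 1 << b, the blended predictor is
+//   p = ROUND_POWER_OF_TWO(m0 * p0 + m1 * p1, b),
+// and since s == (m0 * s + m1 * s) >> b exactly, the blended residual
+// s - p works out to ROUND_POWER_OF_TWO(m0 * (s - p0) + m1 * (s - p1) - 1, b),
+// which is the expression used above.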
+
+static uint64_t equiv_sse_from_residuals(const int16_t *r0, const int16_t *r1,
+ const uint8_t *m, int N) {
+ uint64_t acc = 0;
+ for (int i = 0; i < N; i++) {
+ const int32_t m0 = m[i];
+ const int32_t m1 = MAX_MASK_VALUE - m0;
+ const int16_t R = m0 * r0[i] + m1 * r1[i];
+ const int32_t r = ROUND_POWER_OF_TWO(R - 1, WEDGE_WEIGHT_BITS);
+ acc += r * r;
+ }
+ return acc;
+}
+
+TEST_F(WedgeUtilsSSEFuncTest, ResidualBlendingEquiv) {
+ DECLARE_ALIGNED(32, uint8_t, s[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, p0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, p1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, p[MAX_SB_SQUARE]);
+
+ DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r_ref[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r_tst[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ s[i] = rng_.Rand8();
+ m[i] = rng_(MAX_MASK_VALUE + 1);
+ }
+
+ const int w = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 3) + 3);
+ const int h = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 3) + 3);
+ const int N = w * h;
+
+ for (int j = 0; j < N; j++) {
+ p0[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX);
+ p1[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX);
+ }
+
+ aom_blend_a64_mask(p, w, p0, w, p1, w, m, w, h, w, 0, 0);
+
+ aom_subtract_block(h, w, r0, w, s, w, p0, w);
+ aom_subtract_block(h, w, r1, w, s, w, p1, w);
+
+ aom_subtract_block(h, w, r_ref, w, s, w, p, w);
+ equiv_blend_residuals(r_tst, r0, r1, m, N);
+
+ for (int i = 0; i < N; ++i) ASSERT_EQ(r_ref[i], r_tst[i]);
+
+ uint64_t ref_sse = aom_sum_squares_i16(r_ref, N);
+ uint64_t tst_sse = equiv_sse_from_residuals(r0, r1, m, N);
+
+ ASSERT_EQ(ref_sse, tst_sse);
+ }
+}
+
+static uint64_t sse_from_residuals(const int16_t *r0, const int16_t *r1,
+ const uint8_t *m, int N) {
+ uint64_t acc = 0;
+ for (int i = 0; i < N; i++) {
+ const int32_t m0 = m[i];
+ const int32_t m1 = MAX_MASK_VALUE - m0;
+ const int32_t r = m0 * r0[i] + m1 * r1[i];
+ acc += r * r;
+ }
+ return ROUND_POWER_OF_TWO(acc, 2 * WEDGE_WEIGHT_BITS);
+}
+
+TEST_F(WedgeUtilsSSEFuncTest, ResidualBlendingMethod) {
+ DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r1[i] = rng_(2 * INT8_MAX - 2 * INT8_MIN + 1) + 2 * INT8_MIN;
+ d[i] = rng_(2 * INT8_MAX - 2 * INT8_MIN + 1) + 2 * INT8_MIN;
+ m[i] = rng_(MAX_MASK_VALUE + 1);
+ }
+
+ const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1);
+
+ for (int i = 0; i < N; i++) r0[i] = r1[i] + d[i];
+
+ const uint64_t ref_res = sse_from_residuals(r0, r1, m, N);
+ const uint64_t tst_res = av1_wedge_sse_from_residuals(r1, d, m, N);
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// av1_wedge_sse_from_residuals - optimizations
+//////////////////////////////////////////////////////////////////////////////
+
+typedef uint64_t (*FSSE)(const int16_t *r1, const int16_t *d, const uint8_t *m,
+ int N);
+typedef libaom_test::FuncParam<FSSE> TestFuncsFSSE;
+
+class WedgeUtilsSSEOptTest : public FunctionEquivalenceTest<FSSE> {
+ protected:
+ static const int kIterations = 10000;
+};
+
+TEST_P(WedgeUtilsSSEOptTest, RandomValues) {
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r1[i] = rng_(2 * kInt13Max + 1) - kInt13Max;
+ d[i] = rng_(2 * kInt13Max + 1) - kInt13Max;
+ m[i] = rng_(MAX_MASK_VALUE + 1);
+ }
+
+ const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1);
+
+ const uint64_t ref_res = params_.ref_func(r1, d, m, N);
+ uint64_t tst_res;
+ ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+TEST_P(WedgeUtilsSSEOptTest, ExtremeValues) {
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ if (rng_(2)) {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) r1[i] = kInt13Max;
+ } else {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) r1[i] = -kInt13Max;
+ }
+
+ if (rng_(2)) {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) d[i] = kInt13Max;
+ } else {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) d[i] = -kInt13Max;
+ }
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) m[i] = MAX_MASK_VALUE;
+
+ const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1);
+
+ const uint64_t ref_res = params_.ref_func(r1, d, m, N);
+ uint64_t tst_res;
+ ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, WedgeUtilsSSEOptTest,
+ ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_c,
+ av1_wedge_sse_from_residuals_sse2)));
+
+#endif // HAVE_SSE2
+
+//////////////////////////////////////////////////////////////////////////////
+// av1_wedge_sign_from_residuals
+//////////////////////////////////////////////////////////////////////////////
+
+typedef int (*FSign)(const int16_t *ds, const uint8_t *m, int N, int64_t limit);
+typedef libaom_test::FuncParam<FSign> TestFuncsFSign;
+
+class WedgeUtilsSignOptTest : public FunctionEquivalenceTest<FSign> {
+ protected:
+ static const int kIterations = 10000;
+ static const int kMaxSize = 8196; // Size limited by SIMD implementation.
+};
+
+TEST_P(WedgeUtilsSignOptTest, RandomValues) {
+ DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r0[i] = rng_(2 * kInt13Max + 1) - kInt13Max;
+ r1[i] = rng_(2 * kInt13Max + 1) - kInt13Max;
+ m[i] = rng_(MAX_MASK_VALUE + 1);
+ }
+
+ const int maxN = AOMMIN(kMaxSize, MAX_SB_SQUARE);
+ const int N = 64 * (rng_(maxN / 64 - 1) + 1);
+
+ int64_t limit;
+ limit = (int64_t)aom_sum_squares_i16(r0, N);
+ limit -= (int64_t)aom_sum_squares_i16(r1, N);
+ limit *= (1 << WEDGE_WEIGHT_BITS) / 2;
+
+ for (int i = 0; i < N; i++)
+ ds[i] = clamp(r0[i] * r0[i] - r1[i] * r1[i], INT16_MIN, INT16_MAX);
+
+ const int ref_res = params_.ref_func(ds, m, N, limit);
+ int tst_res;
+ ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+TEST_P(WedgeUtilsSignOptTest, ExtremeValues) {
+ DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ switch (rng_(4)) {
+ case 0:
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r0[i] = 0;
+ r1[i] = kInt13Max;
+ }
+ break;
+ case 1:
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r0[i] = kInt13Max;
+ r1[i] = 0;
+ }
+ break;
+ case 2:
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r0[i] = 0;
+ r1[i] = -kInt13Max;
+ }
+ break;
+ default:
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ r0[i] = -kInt13Max;
+ r1[i] = 0;
+ }
+ break;
+ }
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) m[i] = MAX_MASK_VALUE;
+
+ const int maxN = AOMMIN(kMaxSize, MAX_SB_SQUARE);
+ const int N = 64 * (rng_(maxN / 64 - 1) + 1);
+
+ int64_t limit;
+ limit = (int64_t)aom_sum_squares_i16(r0, N);
+ limit -= (int64_t)aom_sum_squares_i16(r1, N);
+ limit *= (1 << WEDGE_WEIGHT_BITS) / 2;
+
+ for (int i = 0; i < N; i++)
+ ds[i] = clamp(r0[i] * r0[i] - r1[i] * r1[i], INT16_MIN, INT16_MAX);
+
+ const int ref_res = params_.ref_func(ds, m, N, limit);
+ int tst_res;
+ ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+#if HAVE_SSE2
+
+INSTANTIATE_TEST_CASE_P(
+ SSE2, WedgeUtilsSignOptTest,
+ ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_c,
+ av1_wedge_sign_from_residuals_sse2)));
+
+#endif // HAVE_SSE2
+
+//////////////////////////////////////////////////////////////////////////////
+// av1_wedge_compute_delta_squares
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FDS)(int16_t *d, const int16_t *a, const int16_t *b, int N);
+typedef libaom_test::FuncParam<FDS> TestFuncsFDS;
+
+class WedgeUtilsDeltaSquaresOptTest : public FunctionEquivalenceTest<FDS> {
+ protected:
+ static const int kIterations = 10000;
+};
+
+TEST_P(WedgeUtilsDeltaSquaresOptTest, RandomValues) {
+ DECLARE_ALIGNED(32, int16_t, a[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, b[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d_ref[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(32, int16_t, d_tst[MAX_SB_SQUARE]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+ a[i] = rng_.Rand16();
+ b[i] = rng_(2 * INT16_MAX + 1) - INT16_MAX;
+ }
+
+ const int N = 64 * (rng_(MAX_SB_SQUARE / 64) + 1);
+
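+    // Pre-fill both output buffers with the same byte pattern so that entries
+    // beyond N compare equal deterministically in the full-buffer check below.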
+ memset(&d_ref, INT16_MAX, sizeof(d_ref));
+ memset(&d_tst, INT16_MAX, sizeof(d_tst));
+
+ params_.ref_func(d_ref, a, b, N);
+ ASM_REGISTER_STATE_CHECK(params_.tst_func(d_tst, a, b, N));
+
+ for (int i = 0; i < MAX_SB_SQUARE; ++i) ASSERT_EQ(d_ref[i], d_tst[i]);
+ }
+}
+
+#if HAVE_SSE2
+
+INSTANTIATE_TEST_CASE_P(
+ SSE2, WedgeUtilsDeltaSquaresOptTest,
+ ::testing::Values(TestFuncsFDS(av1_wedge_compute_delta_squares_c,
+ av1_wedge_compute_delta_squares_sse2)));
+
+#endif // HAVE_SSE2
+
+} // namespace
diff --git a/test/blend_a64_mask_1d_test.cc b/test/blend_a64_mask_1d_test.cc
index 47e0241..684cbed 100644
--- a/test/blend_a64_mask_1d_test.cc
+++ b/test/blend_a64_mask_1d_test.cc
@@ -93,16 +93,16 @@
T dst_ref_[kBufSize];
T dst_tst_[kBufSize];
- size_t dst_stride_;
- size_t dst_offset_;
+ uint32_t dst_stride_;
+ uint32_t dst_offset_;
T src0_[kBufSize];
- size_t src0_stride_;
- size_t src0_offset_;
+ uint32_t src0_stride_;
+ uint32_t src0_offset_;
T src1_[kBufSize];
- size_t src1_stride_;
- size_t src1_offset_;
+ uint32_t src1_stride_;
+ uint32_t src1_offset_;
uint8_t mask_[kMaxMaskSize];
diff --git a/test/blend_a64_mask_test.cc b/test/blend_a64_mask_test.cc
index cd937cb..4fc59f4 100644
--- a/test/blend_a64_mask_test.cc
+++ b/test/blend_a64_mask_test.cc
@@ -99,16 +99,16 @@
T dst_ref_[kBufSize];
T dst_tst_[kBufSize];
- size_t dst_stride_;
- size_t dst_offset_;
+ uint32_t dst_stride_;
+ uint32_t dst_offset_;
T src0_[kBufSize];
- size_t src0_stride_;
- size_t src0_offset_;
+ uint32_t src0_stride_;
+ uint32_t src0_offset_;
T src1_[kBufSize];
- size_t src1_stride_;
- size_t src1_offset_;
+ uint32_t src1_stride_;
+ uint32_t src1_offset_;
uint8_t mask_[kMaxMaskSize];
size_t mask_stride_;
@@ -116,8 +116,8 @@
int w_;
int h_;
- bool suby_;
- bool subx_;
+ int suby_;
+ int subx_;
};
//////////////////////////////////////////////////////////////////////////////
diff --git a/test/boolcoder_test.cc b/test/boolcoder_test.cc
index 13116ae..c6a3288 100644
--- a/test/boolcoder_test.cc
+++ b/test/boolcoder_test.cc
@@ -35,27 +35,18 @@
for (int i = 0; i < kBitsToTest; ++i) {
const int parity = i & 1;
+ /* clang-format off */
probas[i] =
- (method == 0)
- ? 0
- : (method == 1)
- ? 255
- : (method == 2)
- ? 128
- : (method == 3)
- ? rnd.Rand8()
- : (method == 4)
- ? (parity ? 0 : 255)
- :
- // alternate between low and high proba:
- (method == 5)
- ? (parity ? rnd(128)
- : 255 - rnd(128))
- : (method == 6)
- ? (parity ? rnd(64)
- : 255 - rnd(64))
- : (parity ? rnd(32)
- : 255 - rnd(32));
+ (method == 0) ? 0 : (method == 1) ? 255 :
+ (method == 2) ? 128 :
+ (method == 3) ? rnd.Rand8() :
+ (method == 4) ? (parity ? 0 : 255) :
+ // alternate between low and high proba:
+ (method == 5) ? (parity ? rnd(128) : 255 - rnd(128)) :
+ (method == 6) ?
+ (parity ? rnd(64) : 255 - rnd(64)) :
+ (parity ? rnd(32) : 255 - rnd(32));
+ /* clang-format on */
}
for (int bit_method = 0; bit_method <= 3; ++bit_method) {
const int random_seed = 6432;
@@ -142,11 +133,9 @@
fabs(((tell_frac - last_tell_frac) / 8.0) + log2(probability));
last_tell_frac = tell_frac;
}
- const uint32_t expected = -kSymbols * log2(probability);
- if (last_tell > expected) {
- // Last tell should be close to the expected value.
- GTEST_ASSERT_LE(last_tell - expected, 20u) << " last_tell: " << last_tell;
- }
+ const uint32_t expected = (uint32_t)(-kSymbols * log2(probability));
+ // Last tell should be close to the expected value.
+ GTEST_ASSERT_LE(last_tell, expected + 20) << " last_tell: " << last_tell;
// The average frac_diff error should be pretty small.
GTEST_ASSERT_LE(frac_diff_total / kSymbols, FRAC_DIFF_TOTAL_ERROR)
<< " frac_diff_total: " << frac_diff_total;
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 1961439..837b282 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -14,14 +14,11 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./aom_config.h"
-#include "./av1_rtcd.h"
#include "./aom_dsp_rtcd.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
-#include "av1/common/common.h"
-#include "av1/common/filter.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_mem/aom_mem.h"
@@ -29,7 +26,7 @@
namespace {
-static const unsigned int kMaxDimension = 64;
+static const unsigned int kMaxDimension = MAX_SB_SIZE;
typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
@@ -68,6 +65,27 @@
typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+#define ALL_SIZES(convolve_fn) \
+ make_tuple(128, 64, &convolve_fn), make_tuple(64, 128, &convolve_fn), \
+ make_tuple(128, 128, &convolve_fn), make_tuple(4, 4, &convolve_fn), \
+ make_tuple(8, 4, &convolve_fn), make_tuple(4, 8, &convolve_fn), \
+ make_tuple(8, 8, &convolve_fn), make_tuple(16, 8, &convolve_fn), \
+ make_tuple(8, 16, &convolve_fn), make_tuple(16, 16, &convolve_fn), \
+ make_tuple(32, 16, &convolve_fn), make_tuple(16, 32, &convolve_fn), \
+ make_tuple(32, 32, &convolve_fn), make_tuple(64, 32, &convolve_fn), \
+ make_tuple(32, 64, &convolve_fn), make_tuple(64, 64, &convolve_fn)
+#else
+#define ALL_SIZES(convolve_fn) \
+ make_tuple(4, 4, &convolve_fn), make_tuple(8, 4, &convolve_fn), \
+ make_tuple(4, 8, &convolve_fn), make_tuple(8, 8, &convolve_fn), \
+ make_tuple(16, 8, &convolve_fn), make_tuple(8, 16, &convolve_fn), \
+ make_tuple(16, 16, &convolve_fn), make_tuple(32, 16, &convolve_fn), \
+ make_tuple(16, 32, &convolve_fn), make_tuple(32, 32, &convolve_fn), \
+ make_tuple(64, 32, &convolve_fn), make_tuple(32, 64, &convolve_fn), \
+ make_tuple(64, 64, &convolve_fn)
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+
// Reference 8-tap subpixel filter, slightly modified to fit into this test.
#define AV1_FILTER_WEIGHT 128
#define AV1_FILTER_SHIFT 7
@@ -93,7 +111,7 @@
// = 23
// and filter_max_width = 16
//
- uint8_t intermediate_buffer[71 * kMaxDimension];
+ uint8_t intermediate_buffer[(kMaxDimension + 8) * kMaxDimension];
const int intermediate_next_stride =
1 - static_cast<int>(intermediate_height * output_width);
@@ -162,9 +180,10 @@
assert(output_width <= kMaxDimension);
assert(output_height <= kMaxDimension);
- filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
+ filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, kMaxDimension,
output_width, output_height);
- block2d_average_c(tmp, 64, dst_ptr, dst_stride, output_width, output_height);
+ block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride, output_width,
+ output_height);
}
#if CONFIG_AOM_HIGHBITDEPTH
@@ -189,7 +208,7 @@
* = 23
* and filter_max_width = 16
*/
- uint16_t intermediate_buffer[71 * kMaxDimension];
+ uint16_t intermediate_buffer[(kMaxDimension + 8) * kMaxDimension];
const int intermediate_next_stride =
1 - static_cast<int>(intermediate_height * output_width);
@@ -265,10 +284,10 @@
assert(output_width <= kMaxDimension);
assert(output_height <= kMaxDimension);
- highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
- output_width, output_height, bd);
- highbd_block2d_average_c(tmp, 64, dst_ptr, dst_stride, output_width,
- output_height);
+ highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp,
+ kMaxDimension, output_width, output_height, bd);
+ highbd_block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
+ output_width, output_height);
}
#endif // CONFIG_AOM_HIGHBITDEPTH
@@ -315,7 +334,7 @@
protected:
static const int kDataAlignment = 16;
- static const int kOuterBlockSize = 256;
+ static const int kOuterBlockSize = 4 * kMaxDimension;
static const int kInputStride = kOuterBlockSize;
static const int kOutputStride = kOuterBlockSize;
static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
@@ -390,41 +409,41 @@
}
uint8_t *input() const {
+ const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
#if CONFIG_AOM_HIGHBITDEPTH
if (UUT_->use_highbd_ == 0) {
- return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+ return input_ + offset;
} else {
- return CONVERT_TO_BYTEPTR(input16_) + BorderTop() * kOuterBlockSize +
- BorderLeft();
+ return CONVERT_TO_BYTEPTR(input16_) + offset;
}
#else
- return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+ return input_ + offset;
#endif
}
uint8_t *output() const {
+ const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
#if CONFIG_AOM_HIGHBITDEPTH
if (UUT_->use_highbd_ == 0) {
- return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+ return output_ + offset;
} else {
- return CONVERT_TO_BYTEPTR(output16_) + BorderTop() * kOuterBlockSize +
- BorderLeft();
+ return CONVERT_TO_BYTEPTR(output16_) + offset;
}
#else
- return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+ return output_ + offset;
#endif
}
uint8_t *output_ref() const {
+ const int offset = BorderTop() * kOuterBlockSize + BorderLeft();
#if CONFIG_AOM_HIGHBITDEPTH
if (UUT_->use_highbd_ == 0) {
- return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+ return output_ref_ + offset;
} else {
- return CONVERT_TO_BYTEPTR(output16_ref_) + BorderTop() * kOuterBlockSize +
- BorderLeft();
+ return CONVERT_TO_BYTEPTR(output16_ref_) + offset;
}
#else
- return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+ return output_ref_ + offset;
#endif
}
@@ -968,16 +987,6 @@
wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8,
wrap_convolve8_avg_c_8, 8);
-INSTANTIATE_TEST_CASE_P(
- C_8, ConvolveTest,
- ::testing::Values(
- make_tuple(4, 4, &convolve8_c), make_tuple(8, 4, &convolve8_c),
- make_tuple(4, 8, &convolve8_c), make_tuple(8, 8, &convolve8_c),
- make_tuple(16, 8, &convolve8_c), make_tuple(8, 16, &convolve8_c),
- make_tuple(16, 16, &convolve8_c), make_tuple(32, 16, &convolve8_c),
- make_tuple(16, 32, &convolve8_c), make_tuple(32, 32, &convolve8_c),
- make_tuple(64, 32, &convolve8_c), make_tuple(32, 64, &convolve8_c),
- make_tuple(64, 64, &convolve8_c)));
const ConvolveFunctions convolve10_c(
wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, wrap_convolve8_horiz_c_10,
wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10,
@@ -985,16 +994,6 @@
wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10,
wrap_convolve8_avg_c_10, 10);
-INSTANTIATE_TEST_CASE_P(
- C_10, ConvolveTest,
- ::testing::Values(
- make_tuple(4, 4, &convolve10_c), make_tuple(8, 4, &convolve10_c),
- make_tuple(4, 8, &convolve10_c), make_tuple(8, 8, &convolve10_c),
- make_tuple(16, 8, &convolve10_c), make_tuple(8, 16, &convolve10_c),
- make_tuple(16, 16, &convolve10_c), make_tuple(32, 16, &convolve10_c),
- make_tuple(16, 32, &convolve10_c), make_tuple(32, 32, &convolve10_c),
- make_tuple(64, 32, &convolve10_c), make_tuple(32, 64, &convolve10_c),
- make_tuple(64, 64, &convolve10_c)));
const ConvolveFunctions convolve12_c(
wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, wrap_convolve8_horiz_c_12,
wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12,
@@ -1002,37 +1001,20 @@
wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
wrap_convolve8_avg_c_12, 12);
-INSTANTIATE_TEST_CASE_P(
- C_12, ConvolveTest,
- ::testing::Values(
- make_tuple(4, 4, &convolve12_c), make_tuple(8, 4, &convolve12_c),
- make_tuple(4, 8, &convolve12_c), make_tuple(8, 8, &convolve12_c),
- make_tuple(16, 8, &convolve12_c), make_tuple(8, 16, &convolve12_c),
- make_tuple(16, 16, &convolve12_c), make_tuple(32, 16, &convolve12_c),
- make_tuple(16, 32, &convolve12_c), make_tuple(32, 32, &convolve12_c),
- make_tuple(64, 32, &convolve12_c), make_tuple(32, 64, &convolve12_c),
- make_tuple(64, 64, &convolve12_c)));
+const ConvolveParam kArrayConvolve_c[] = {
+ ALL_SIZES(convolve8_c), ALL_SIZES(convolve10_c), ALL_SIZES(convolve12_c)
+};
#else
-
const ConvolveFunctions convolve8_c(
aom_convolve_copy_c, aom_convolve_avg_c, aom_convolve8_horiz_c,
aom_convolve8_avg_horiz_c, aom_convolve8_vert_c, aom_convolve8_avg_vert_c,
aom_convolve8_c, aom_convolve8_avg_c, aom_scaled_horiz_c,
aom_scaled_avg_horiz_c, aom_scaled_vert_c, aom_scaled_avg_vert_c,
aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
-
-INSTANTIATE_TEST_CASE_P(
- C, ConvolveTest,
- ::testing::Values(
- make_tuple(4, 4, &convolve8_c), make_tuple(8, 4, &convolve8_c),
- make_tuple(4, 8, &convolve8_c), make_tuple(8, 8, &convolve8_c),
- make_tuple(16, 8, &convolve8_c), make_tuple(8, 16, &convolve8_c),
- make_tuple(16, 16, &convolve8_c), make_tuple(32, 16, &convolve8_c),
- make_tuple(16, 32, &convolve8_c), make_tuple(32, 32, &convolve8_c),
- make_tuple(64, 32, &convolve8_c), make_tuple(32, 64, &convolve8_c),
- make_tuple(64, 64, &convolve8_c)));
+const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c) };
#endif
+INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::ValuesIn(kArrayConvolve_c));
#if HAVE_SSE2 && ARCH_X86_64
#if CONFIG_AOM_HIGHBITDEPTH
@@ -1060,40 +1042,9 @@
wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12, 12);
-INSTANTIATE_TEST_CASE_P(
- SSE2, ConvolveTest,
- ::testing::Values(
- make_tuple(4, 4, &convolve8_sse2), make_tuple(8, 4, &convolve8_sse2),
- make_tuple(4, 8, &convolve8_sse2), make_tuple(8, 8, &convolve8_sse2),
- make_tuple(16, 8, &convolve8_sse2), make_tuple(8, 16, &convolve8_sse2),
- make_tuple(16, 16, &convolve8_sse2),
- make_tuple(32, 16, &convolve8_sse2),
- make_tuple(16, 32, &convolve8_sse2),
- make_tuple(32, 32, &convolve8_sse2),
- make_tuple(64, 32, &convolve8_sse2),
- make_tuple(32, 64, &convolve8_sse2),
- make_tuple(64, 64, &convolve8_sse2), make_tuple(4, 4, &convolve10_sse2),
- make_tuple(8, 4, &convolve10_sse2), make_tuple(4, 8, &convolve10_sse2),
- make_tuple(8, 8, &convolve10_sse2), make_tuple(16, 8, &convolve10_sse2),
- make_tuple(8, 16, &convolve10_sse2),
- make_tuple(16, 16, &convolve10_sse2),
- make_tuple(32, 16, &convolve10_sse2),
- make_tuple(16, 32, &convolve10_sse2),
- make_tuple(32, 32, &convolve10_sse2),
- make_tuple(64, 32, &convolve10_sse2),
- make_tuple(32, 64, &convolve10_sse2),
- make_tuple(64, 64, &convolve10_sse2),
- make_tuple(4, 4, &convolve12_sse2), make_tuple(8, 4, &convolve12_sse2),
- make_tuple(4, 8, &convolve12_sse2), make_tuple(8, 8, &convolve12_sse2),
- make_tuple(16, 8, &convolve12_sse2),
- make_tuple(8, 16, &convolve12_sse2),
- make_tuple(16, 16, &convolve12_sse2),
- make_tuple(32, 16, &convolve12_sse2),
- make_tuple(16, 32, &convolve12_sse2),
- make_tuple(32, 32, &convolve12_sse2),
- make_tuple(64, 32, &convolve12_sse2),
- make_tuple(32, 64, &convolve12_sse2),
- make_tuple(64, 64, &convolve12_sse2)));
+const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2),
+ ALL_SIZES(convolve10_sse2),
+ ALL_SIZES(convolve12_sse2) };
#else
const ConvolveFunctions convolve8_sse2(
aom_convolve_copy_sse2, aom_convolve_avg_sse2, aom_convolve8_horiz_sse2,
@@ -1102,21 +1053,10 @@
aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
-INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest,
- ::testing::Values(make_tuple(4, 4, &convolve8_sse2),
- make_tuple(8, 4, &convolve8_sse2),
- make_tuple(4, 8, &convolve8_sse2),
- make_tuple(8, 8, &convolve8_sse2),
- make_tuple(16, 8, &convolve8_sse2),
- make_tuple(8, 16, &convolve8_sse2),
- make_tuple(16, 16, &convolve8_sse2),
- make_tuple(32, 16, &convolve8_sse2),
- make_tuple(16, 32, &convolve8_sse2),
- make_tuple(32, 32, &convolve8_sse2),
- make_tuple(64, 32, &convolve8_sse2),
- make_tuple(32, 64, &convolve8_sse2),
- make_tuple(64, 64, &convolve8_sse2)));
+const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2) };
#endif // CONFIG_AOM_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest,
+ ::testing::ValuesIn(kArrayConvolve_sse2));
#endif
#if HAVE_SSSE3
@@ -1125,23 +1065,11 @@
aom_convolve8_avg_horiz_ssse3, aom_convolve8_vert_ssse3,
aom_convolve8_avg_vert_ssse3, aom_convolve8_ssse3, aom_convolve8_avg_ssse3,
aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
- aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
+ aom_scaled_avg_vert_c, aom_scaled_2d_ssse3, aom_scaled_avg_2d_c, 0);
+const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) };
INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest,
- ::testing::Values(make_tuple(4, 4, &convolve8_ssse3),
- make_tuple(8, 4, &convolve8_ssse3),
- make_tuple(4, 8, &convolve8_ssse3),
- make_tuple(8, 8, &convolve8_ssse3),
- make_tuple(16, 8, &convolve8_ssse3),
- make_tuple(8, 16, &convolve8_ssse3),
- make_tuple(16, 16, &convolve8_ssse3),
- make_tuple(32, 16, &convolve8_ssse3),
- make_tuple(16, 32, &convolve8_ssse3),
- make_tuple(32, 32, &convolve8_ssse3),
- make_tuple(64, 32, &convolve8_ssse3),
- make_tuple(32, 64, &convolve8_ssse3),
- make_tuple(64, 64,
- &convolve8_ssse3)));
+ ::testing::ValuesIn(kArrayConvolve8_ssse3));
#endif
#if HAVE_AVX2 && HAVE_SSSE3
@@ -1152,23 +1080,13 @@
aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
+const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) };
INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest,
- ::testing::Values(make_tuple(4, 4, &convolve8_avx2),
- make_tuple(8, 4, &convolve8_avx2),
- make_tuple(4, 8, &convolve8_avx2),
- make_tuple(8, 8, &convolve8_avx2),
- make_tuple(8, 16, &convolve8_avx2),
- make_tuple(16, 8, &convolve8_avx2),
- make_tuple(16, 16, &convolve8_avx2),
- make_tuple(32, 16, &convolve8_avx2),
- make_tuple(16, 32, &convolve8_avx2),
- make_tuple(32, 32, &convolve8_avx2),
- make_tuple(64, 32, &convolve8_avx2),
- make_tuple(32, 64, &convolve8_avx2),
- make_tuple(64, 64, &convolve8_avx2)));
+ ::testing::ValuesIn(kArrayConvolve8_avx2));
#endif // HAVE_AVX2 && HAVE_SSSE3
-#if HAVE_NEON
+// TODO(any): Make NEON versions support 128x128, 128x64, 64x128 block sizes
+#if HAVE_NEON && !(CONFIG_AV1 && CONFIG_EXT_PARTITION)
#if HAVE_NEON_ASM
const ConvolveFunctions convolve8_neon(
aom_convolve_copy_neon, aom_convolve_avg_neon, aom_convolve8_horiz_neon,
@@ -1185,23 +1103,13 @@
aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
#endif // HAVE_NEON_ASM
+const ConvolveParam kArrayConvolve8_neon[] = { ALL_SIZES(convolve8_neon) };
INSTANTIATE_TEST_CASE_P(NEON, ConvolveTest,
- ::testing::Values(make_tuple(4, 4, &convolve8_neon),
- make_tuple(8, 4, &convolve8_neon),
- make_tuple(4, 8, &convolve8_neon),
- make_tuple(8, 8, &convolve8_neon),
- make_tuple(16, 8, &convolve8_neon),
- make_tuple(8, 16, &convolve8_neon),
- make_tuple(16, 16, &convolve8_neon),
- make_tuple(32, 16, &convolve8_neon),
- make_tuple(16, 32, &convolve8_neon),
- make_tuple(32, 32, &convolve8_neon),
- make_tuple(64, 32, &convolve8_neon),
- make_tuple(32, 64, &convolve8_neon),
- make_tuple(64, 64, &convolve8_neon)));
+ ::testing::ValuesIn(kArrayConvolve8_neon));
#endif // HAVE_NEON
-#if HAVE_DSPR2
+// TODO(any): Make DSPR2 versions support 128x128, 128x64, 64x128 block sizes
+#if HAVE_DSPR2 && !(CONFIG_AV1 && CONFIG_EXT_PARTITION)
const ConvolveFunctions convolve8_dspr2(
aom_convolve_copy_dspr2, aom_convolve_avg_dspr2, aom_convolve8_horiz_dspr2,
aom_convolve8_avg_horiz_dspr2, aom_convolve8_vert_dspr2,
@@ -1209,24 +1117,13 @@
aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
+const ConvolveParam kArrayConvolve8_dspr2[] = { ALL_SIZES(convolve8_dspr2) };
INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest,
- ::testing::Values(make_tuple(4, 4, &convolve8_dspr2),
- make_tuple(8, 4, &convolve8_dspr2),
- make_tuple(4, 8, &convolve8_dspr2),
- make_tuple(8, 8, &convolve8_dspr2),
- make_tuple(16, 8, &convolve8_dspr2),
- make_tuple(8, 16, &convolve8_dspr2),
- make_tuple(16, 16, &convolve8_dspr2),
- make_tuple(32, 16, &convolve8_dspr2),
- make_tuple(16, 32, &convolve8_dspr2),
- make_tuple(32, 32, &convolve8_dspr2),
- make_tuple(64, 32, &convolve8_dspr2),
- make_tuple(32, 64, &convolve8_dspr2),
- make_tuple(64, 64,
- &convolve8_dspr2)));
-#endif
+ ::testing::ValuesIn(kArrayConvolve8_dspr2));
+#endif // HAVE_DSPR2
-#if HAVE_MSA
+// TODO(any): Make MSA versions support 128x128, 128x64, 64x128 block sizes
+#if HAVE_MSA && !(CONFIG_AV1 && CONFIG_EXT_PARTITION)
const ConvolveFunctions convolve8_msa(
aom_convolve_copy_msa, aom_convolve_avg_msa, aom_convolve8_horiz_msa,
aom_convolve8_avg_horiz_msa, aom_convolve8_vert_msa,
@@ -1234,15 +1131,8 @@
aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
-INSTANTIATE_TEST_CASE_P(
- MSA, ConvolveTest,
- ::testing::Values(
- make_tuple(4, 4, &convolve8_msa), make_tuple(8, 4, &convolve8_msa),
- make_tuple(4, 8, &convolve8_msa), make_tuple(8, 8, &convolve8_msa),
- make_tuple(16, 8, &convolve8_msa), make_tuple(8, 16, &convolve8_msa),
- make_tuple(16, 16, &convolve8_msa), make_tuple(32, 16, &convolve8_msa),
- make_tuple(16, 32, &convolve8_msa), make_tuple(32, 32, &convolve8_msa),
- make_tuple(64, 32, &convolve8_msa), make_tuple(32, 64, &convolve8_msa),
- make_tuple(64, 64, &convolve8_msa)));
+const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES(convolve8_msa) };
+INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest,
+ ::testing::ValuesIn(kArrayConvolve8_msa));
#endif // HAVE_MSA
} // namespace
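For readers without the top of convolve_test.cc: the hunks above replace the hand-written make_tuple lists with an ALL_SIZES(func) macro fed into ::testing::ValuesIn, presumably including the 128-wide block sizes implied by the CONFIG_EXT_PARTITION guards below. A shortened, hypothetical sketch of that pattern (the real ALL_SIZES defined earlier in the file enumerates every tested size and may differ):

// Hypothetical sketch only; names suffixed with "Sketch" are not in the diff.
typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
#define ALL_SIZES_SKETCH(convolve_fn)                                     \
  make_tuple(4, 4, &convolve_fn), make_tuple(8, 8, &convolve_fn),         \
      make_tuple(16, 16, &convolve_fn), make_tuple(32, 32, &convolve_fn), \
      make_tuple(64, 64, &convolve_fn)

const ConvolveParam kArrayConvolveSketch[] = { ALL_SIZES_SKETCH(convolve8_c) };
INSTANTIATE_TEST_CASE_P(CSketch, ConvolveTest,
                        ::testing::ValuesIn(kArrayConvolveSketch));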
diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc
index f09880d..ae98852 100644
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -26,7 +26,8 @@
protected:
CpuSpeedTest()
: EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
- set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR) {}
+ set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR),
+ tune_content_(AOM_CONTENT_DEFAULT) {}
virtual ~CpuSpeedTest() {}
virtual void SetUp() {
@@ -47,6 +48,7 @@
::libaom_test::Encoder *encoder) {
if (video->frame() == 1) {
encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ encoder->Control(AV1E_SET_TUNE_CONTENT, tune_content_);
if (encoding_mode_ != ::libaom_test::kRealTime) {
encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
@@ -60,24 +62,31 @@
if (pkt->data.psnr.psnr[0] < min_psnr_) min_psnr_ = pkt->data.psnr.psnr[0];
}
+ void TestQ0();
+ void TestScreencastQ0();
+ void TestTuneScreen();
+ void TestEncodeHighBitrate();
+ void TestLowBitrate();
+
::libaom_test::TestMode encoding_mode_;
int set_cpu_used_;
double min_psnr_;
+ int tune_content_;
};
-TEST_P(CpuSpeedTest, TestQ0) {
+void CpuSpeedTest::TestQ0() {
   // Validate that this non-multiple-of-64-wide clip encodes and decodes
// without a mismatch when passing in a very low max q. This pushes
// the encoder to producing lots of big partitions which will likely
// extend into the border and test the border condition.
cfg_.rc_2pass_vbr_minsection_pct = 5;
- cfg_.rc_2pass_vbr_minsection_pct = 2000;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
cfg_.rc_target_bitrate = 400;
cfg_.rc_max_quantizer = 0;
cfg_.rc_min_quantizer = 0;
::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
- 20);
+ 10);
init_flags_ = AOM_CODEC_USE_PSNR;
@@ -85,11 +94,11 @@
EXPECT_GE(min_psnr_, kMaxPSNR);
}
-TEST_P(CpuSpeedTest, TestScreencastQ0) {
- ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 25);
+void CpuSpeedTest::TestScreencastQ0() {
+ ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 10);
cfg_.g_timebase = video.timebase();
cfg_.rc_2pass_vbr_minsection_pct = 5;
- cfg_.rc_2pass_vbr_minsection_pct = 2000;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
cfg_.rc_target_bitrate = 400;
cfg_.rc_max_quantizer = 0;
cfg_.rc_min_quantizer = 0;
@@ -100,40 +109,73 @@
EXPECT_GE(min_psnr_, kMaxPSNR);
}
-TEST_P(CpuSpeedTest, TestEncodeHighBitrate) {
+void CpuSpeedTest::TestTuneScreen() {
+ ::libaom_test::Y4mVideoSource video("screendata.y4m", 0, 10);
+ cfg_.g_timebase = video.timebase();
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 2000;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_min_quantizer = 0;
+ tune_content_ = AOM_CONTENT_SCREEN;
+
+ init_flags_ = AOM_CODEC_USE_PSNR;
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+void CpuSpeedTest::TestEncodeHighBitrate() {
   // Validate that this non-multiple-of-64-wide clip encodes and decodes
// without a mismatch when passing in a very low max q. This pushes
// the encoder to producing lots of big partitions which will likely
// extend into the border and test the border condition.
cfg_.rc_2pass_vbr_minsection_pct = 5;
- cfg_.rc_2pass_vbr_minsection_pct = 2000;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
cfg_.rc_target_bitrate = 12000;
cfg_.rc_max_quantizer = 10;
cfg_.rc_min_quantizer = 0;
::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
- 20);
+ 10);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
}
-TEST_P(CpuSpeedTest, TestLowBitrate) {
+void CpuSpeedTest::TestLowBitrate() {
// Validate that this clip encodes and decodes without a mismatch
// when passing in a very high min q. This pushes the encoder to producing
   // lots of small partitions which will test the other condition.
cfg_.rc_2pass_vbr_minsection_pct = 5;
- cfg_.rc_2pass_vbr_minsection_pct = 2000;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
cfg_.rc_target_bitrate = 200;
cfg_.rc_min_quantizer = 40;
::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
- 20);
+ 10);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
}
+TEST_P(CpuSpeedTest, TestQ0) { TestQ0(); }
+TEST_P(CpuSpeedTest, TestScreencastQ0) { TestScreencastQ0(); }
+TEST_P(CpuSpeedTest, TestTuneScreen) { TestTuneScreen(); }
+TEST_P(CpuSpeedTest, TestEncodeHighBitrate) { TestEncodeHighBitrate(); }
+TEST_P(CpuSpeedTest, TestLowBitrate) { TestLowBitrate(); }
+
+class CpuSpeedTestLarge : public CpuSpeedTest {};
+
+TEST_P(CpuSpeedTestLarge, TestQ0) { TestQ0(); }
+TEST_P(CpuSpeedTestLarge, TestScreencastQ0) { TestScreencastQ0(); }
+TEST_P(CpuSpeedTestLarge, TestTuneScreen) { TestTuneScreen(); }
+TEST_P(CpuSpeedTestLarge, TestEncodeHighBitrate) { TestEncodeHighBitrate(); }
+TEST_P(CpuSpeedTestLarge, TestLowBitrate) { TestLowBitrate(); }
+
AV1_INSTANTIATE_TEST_CASE(CpuSpeedTest,
::testing::Values(::libaom_test::kTwoPassGood,
::libaom_test::kOnePassGood),
- ::testing::Range(0, 3));
+ ::testing::Range(1, 3));
+AV1_INSTANTIATE_TEST_CASE(CpuSpeedTestLarge,
+ ::testing::Values(::libaom_test::kTwoPassGood,
+ ::libaom_test::kOnePassGood),
+ ::testing::Range(0, 1));
} // namespace
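The cpu_speed_test.cc change above moves each test body into a plain member function so that both CpuSpeedTest and the new CpuSpeedTestLarge subclass can reuse it while being instantiated over different speed ranges (speeds 1-2 versus speed 0). A minimal, self-contained sketch of that gtest pattern; the fixture and test names here are illustrative, not from the diff:

#include "third_party/googletest/src/include/gtest/gtest.h"

class SpeedFixture : public ::testing::TestWithParam<int> {
 protected:
  // Shared body, callable from any derived fixture's TEST_P wrapper.
  void RunBody() const { EXPECT_GE(GetParam(), 0); }
};
class SpeedFixtureLarge : public SpeedFixture {};

TEST_P(SpeedFixture, Body) { RunBody(); }
TEST_P(SpeedFixtureLarge, Body) { RunBody(); }

// Fast speeds stay in the regular target; the slow speed moves to *Large.
INSTANTIATE_TEST_CASE_P(CpuSpeeds, SpeedFixture, ::testing::Range(1, 3));
INSTANTIATE_TEST_CASE_P(CpuSpeeds, SpeedFixtureLarge, ::testing::Range(0, 1));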
diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 947c424..0f66440 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc
@@ -22,250 +22,13 @@
class DatarateTestLarge
: public ::libaom_test::EncoderTest,
- public ::libaom_test::CodecTestWithParam<libaom_test::TestMode> {
+ public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int> {
public:
DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {}
+ protected:
virtual ~DatarateTestLarge() {}
- protected:
- virtual void SetUp() {
- InitializeConfig();
- SetMode(GET_PARAM(1));
- ResetModel();
- }
-
- virtual void ResetModel() {
- last_pts_ = 0;
- bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz;
- frame_number_ = 0;
- first_drop_ = 0;
- bits_total_ = 0;
- duration_ = 0.0;
- denoiser_offon_test_ = 0;
- denoiser_offon_period_ = -1;
- }
-
- virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
- ::libaom_test::Encoder *encoder) {
- if (video->frame() == 0)
- encoder->Control(AOME_SET_NOISE_SENSITIVITY, denoiser_on_);
-
- if (denoiser_offon_test_) {
- ASSERT_GT(denoiser_offon_period_, 0)
- << "denoiser_offon_period_ is not positive.";
- if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
- // Flip denoiser_on_ periodically
- denoiser_on_ ^= 1;
- }
- encoder->Control(AOME_SET_NOISE_SENSITIVITY, denoiser_on_);
- }
-
- const aom_rational_t tb = video->timebase();
- timebase_ = static_cast<double>(tb.num) / tb.den;
- duration_ = 0;
- }
-
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
- // Time since last timestamp = duration.
- aom_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
-
- // TODO(jimbankoski): Remove these lines when the issue:
- // http://code.google.com/p/webm/issues/detail?id=496 is fixed.
- // For now the codec assumes buffer starts at starting buffer rate
- // plus one frame's time.
- if (last_pts_ == 0) duration = 1;
-
- // Add to the buffer the bits we'd expect from a constant bitrate server.
- bits_in_buffer_model_ += static_cast<int64_t>(
- duration * timebase_ * cfg_.rc_target_bitrate * 1000);
-
- /* Test the buffer model here before subtracting the frame. Do so because
- * the way the leaky bucket model works in libaom is to allow the buffer to
- * empty - and then stop showing frames until we've got enough bits to
- * show one. As noted in comment below (issue 495), this does not currently
- * apply to key frames. For now exclude key frames in condition below. */
- const bool key_frame =
- (pkt->data.frame.flags & AOM_FRAME_IS_KEY) ? true : false;
- if (!key_frame) {
- ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame "
- << pkt->data.frame.pts;
- }
-
- const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
-
- // Subtract from the buffer the bits associated with a played back frame.
- bits_in_buffer_model_ -= frame_size_in_bits;
-
- // Update the running total of bits for end of test datarate checks.
- bits_total_ += frame_size_in_bits;
-
- // If first drop not set and we have a drop set it to this time.
- if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1;
-
- // Update the most recent pts.
- last_pts_ = pkt->data.frame.pts;
-
- // We update this so that we can calculate the datarate minus the last
- // frame encoded in the file.
- bits_in_last_frame_ = frame_size_in_bits;
-
- ++frame_number_;
- }
-
- virtual void EndPassHook(void) {
- if (bits_total_) {
- const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit
-
- duration_ = (last_pts_ + 1) * timebase_;
-
- // Effective file datarate includes the time spent prebuffering.
- effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0 /
- (cfg_.rc_buf_initial_sz / 1000.0 + duration_);
-
- file_datarate_ = file_size_in_kb / duration_;
- }
- }
-
- aom_codec_pts_t last_pts_;
- int64_t bits_in_buffer_model_;
- double timebase_;
- int frame_number_;
- aom_codec_pts_t first_drop_;
- int64_t bits_total_;
- double duration_;
- double file_datarate_;
- double effective_datarate_;
- size_t bits_in_last_frame_;
- int denoiser_on_;
- int denoiser_offon_test_;
- int denoiser_offon_period_;
-};
-
-#if CONFIG_TEMPORAL_DENOISING
-// Check basic datarate targeting, for a single bitrate, but loop over the
-// various denoiser settings.
-TEST_P(DatarateTestLarge, DenoiserLevels) {
- cfg_.rc_buf_initial_sz = 500;
- cfg_.rc_dropframe_thresh = 1;
- cfg_.rc_max_quantizer = 56;
- cfg_.rc_end_usage = AOM_CBR;
- ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
- 30, 1, 0, 140);
- for (int j = 1; j < 5; ++j) {
- // Run over the denoiser levels.
- // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j
- // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV,
- // denoiserOnAggressive, and denoiserOnAdaptive.
- // For the spatial denoiser (if !CONFIG_TEMPORAL_DENOISING), the level j
- // refers to the blur thresholds: 20, 40, 60 80.
- // The j = 0 case (denoiser off) is covered in the tests below.
- denoiser_on_ = j;
- cfg_.rc_target_bitrate = 300;
- ResetModel();
- ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
- << " The datarate for the file exceeds the target!";
-
- ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3)
- << " The datarate for the file missed the target!";
- }
-}
-
-// Check basic datarate targeting, for a single bitrate, when denoiser is off
-// and on.
-TEST_P(DatarateTestLarge, DenoiserOffOn) {
- cfg_.rc_buf_initial_sz = 500;
- cfg_.rc_dropframe_thresh = 1;
- cfg_.rc_max_quantizer = 56;
- cfg_.rc_end_usage = AOM_CBR;
- ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
- 30, 1, 0, 299);
- cfg_.rc_target_bitrate = 300;
- ResetModel();
- // The denoiser is off by default.
- denoiser_on_ = 0;
- // Set the offon test flag.
- denoiser_offon_test_ = 1;
- denoiser_offon_period_ = 100;
- ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
- << " The datarate for the file exceeds the target!";
- ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3)
- << " The datarate for the file missed the target!";
-}
-#endif // CONFIG_TEMPORAL_DENOISING
-
-TEST_P(DatarateTestLarge, BasicBufferModel) {
- denoiser_on_ = 0;
- cfg_.rc_buf_initial_sz = 500;
- cfg_.rc_dropframe_thresh = 1;
- cfg_.rc_max_quantizer = 56;
- cfg_.rc_end_usage = AOM_CBR;
- // 2 pass cbr datarate control has a bug hidden by the small # of
- // frames selected in this encode. The problem is that even if the buffer is
- // negative we produce a keyframe on a cutscene. Ignoring datarate
- // constraints
- // TODO(jimbankoski): ( Fix when issue
- // http://code.google.com/p/webm/issues/detail?id=495 is addressed. )
- ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
- 30, 1, 0, 140);
-
- // There is an issue for low bitrates in real-time mode, where the
- // effective_datarate slightly overshoots the target bitrate.
- // This is same the issue as noted about (#495).
- // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100),
- // when the issue is resolved.
- for (int i = 100; i < 800; i += 200) {
- cfg_.rc_target_bitrate = i;
- ResetModel();
- ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95)
- << " The datarate for the file exceeds the target!";
-
- ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.3)
- << " The datarate for the file missed the target!";
- }
-}
-
-TEST_P(DatarateTestLarge, ChangingDropFrameThresh) {
- denoiser_on_ = 0;
- cfg_.rc_buf_initial_sz = 500;
- cfg_.rc_max_quantizer = 36;
- cfg_.rc_end_usage = AOM_CBR;
- cfg_.rc_target_bitrate = 200;
- cfg_.kf_mode = AOM_KF_DISABLED;
-
- const int frame_count = 40;
- ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
- 30, 1, 0, frame_count);
-
- // Here we check that the first dropped frame gets earlier and earlier
- // as the drop frame threshold is increased.
-
- const int kDropFrameThreshTestStep = 30;
- aom_codec_pts_t last_drop = frame_count;
- for (int i = 1; i < 91; i += kDropFrameThreshTestStep) {
- cfg_.rc_dropframe_thresh = i;
- ResetModel();
- ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_LE(first_drop_, last_drop)
- << " The first dropped frame for drop_thresh " << i
- << " > first dropped frame for drop_thresh "
- << i - kDropFrameThreshTestStep;
- last_drop = first_drop_;
- }
-}
-
-class DatarateTestAV1Large
- : public ::libaom_test::EncoderTest,
- public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int> {
- public:
- DatarateTestAV1Large() : EncoderTest(GET_PARAM(0)) {}
-
- protected:
- virtual ~DatarateTestAV1Large() {}
-
virtual void SetUp() {
InitializeConfig();
SetMode(GET_PARAM(1));
@@ -317,9 +80,11 @@
// Update the number of frame drops.
num_drops_ += static_cast<int>(duration - 1);
// Update counter for total number of frames (#frames input to encoder).
+ // Needed for setting the proper layer_id below.
tot_frame_number_ += static_cast<int>(duration - 1);
}
+ // Add to the buffer the bits we'd expect from a constant bitrate server.
bits_in_buffer_model_ += static_cast<int64_t>(
duration * timebase_ * cfg_.rc_target_bitrate * 1000);
@@ -340,10 +105,8 @@
virtual void EndPassHook(void) {
duration_ = (last_pts_ + 1) * timebase_;
- if (bits_total_) {
- // Effective file datarate:
- effective_datarate_ = (bits_total_ / 1000.0) / duration_;
- }
+ // Effective file datarate:
+ effective_datarate_ = (bits_total_ / 1000.0) / duration_;
}
aom_codec_pts_t last_pts_;
@@ -362,8 +125,29 @@
int denoiser_offon_period_;
};
-// Check basic rate targeting,
-TEST_P(DatarateTestAV1Large, BasicRateTargeting) {
+// Check basic rate targeting for VBR mode.
+TEST_P(DatarateTestLarge, BasicRateTargetingVBR) {
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.g_error_resilient = 0;
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.g_lag_in_frames = 0;
+
+ ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ 30, 1, 0, 300);
+ for (int i = 400; i <= 800; i += 400) {
+ cfg_.rc_target_bitrate = i;
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_, cfg_.rc_target_bitrate * 0.75)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_, cfg_.rc_target_bitrate * 1.25)
+ << " The datarate for the file is greater than target by too much!";
+ }
+}
+
+// Check basic rate targeting for CBR.
+TEST_P(DatarateTestLarge, BasicRateTargeting) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
@@ -386,8 +170,8 @@
}
}
-// Check basic rate targeting,
-TEST_P(DatarateTestAV1Large, BasicRateTargeting444) {
+// Check basic rate targeting for CBR.
+TEST_P(DatarateTestLarge, BasicRateTargeting444) {
::libaom_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
cfg_.g_profile = 1;
@@ -419,7 +203,7 @@
// as the drop frame threshold is increased, and (2) that the total number of
// frame drops does not decrease as we increase frame drop threshold.
// Use a lower qp-max to force some frame drops.
-TEST_P(DatarateTestAV1Large, ChangingDropFrameThresh) {
+TEST_P(DatarateTestLarge, ChangingDropFrameThresh) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
@@ -431,6 +215,9 @@
cfg_.rc_end_usage = AOM_CBR;
cfg_.rc_target_bitrate = 200;
cfg_.g_lag_in_frames = 0;
+ // TODO(marpan): Investigate datarate target failures with a smaller keyframe
+ // interval (128).
+ cfg_.kf_max_dist = 9999;
::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
30, 1, 0, 140);
@@ -459,9 +246,8 @@
}
}
-AV1_INSTANTIATE_TEST_CASE(DatarateTestAV1Large,
+AV1_INSTANTIATE_TEST_CASE(DatarateTestLarge,
::testing::Values(::libaom_test::kOnePassGood,
::libaom_test::kRealTime),
- ::testing::Range(2, 7));
-
+ ::testing::Range(2, 9));
} // namespace
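The buffer-model bookkeeping kept in FramePktHook above boils down to simple leaky-bucket arithmetic: refill the buffer at the target bitrate for the elapsed pts duration, check for underrun on non-key frames, then drain it by the size of the frame just shown. A stand-alone sketch of that arithmetic; the function and parameter names are illustrative, the fixture keeps these values as members instead:

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Leaky-bucket update: one call per encoded packet.
int64_t UpdateBufferModel(int64_t bits_in_buffer, int64_t pts_duration,
                          double timebase_sec, int target_bitrate_kbps,
                          size_t frame_size_bytes, bool is_key_frame) {
  // The constant-bitrate "server" adds bits for the time that has elapsed.
  bits_in_buffer += static_cast<int64_t>(pts_duration * timebase_sec *
                                         target_bitrate_kbps * 1000);
  // The underrun check happens before the current frame is subtracted; key
  // frames are exempt (see the issue-495 note in the hunks above).
  if (!is_key_frame) assert(bits_in_buffer >= 0);
  // The "client" then drains the bits of the frame just displayed.
  bits_in_buffer -= static_cast<int64_t>(frame_size_bytes) * 8;
  return bits_in_buffer;
}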
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index f5931c6..b4bb14f 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -282,7 +282,6 @@
}
#if HAVE_SSE2
-
void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
aom_highbd_idct16x16_10_add_c(in, out, stride, 10);
}
@@ -534,10 +533,10 @@
for (int j = 0; j < kNumCoeffs; ++j) {
#if CONFIG_AOM_HIGHBITDEPTH
- const uint32_t diff =
+ const int diff =
bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
#else
- const uint32_t diff = dst[j] - src[j];
+ const int diff = dst[j] - src[j];
#endif // CONFIG_AOM_HIGHBITDEPTH
const uint32_t error = diff * diff;
EXPECT_GE(1u, error) << "Error: 16x16 IDCT has error " << error
@@ -590,10 +589,10 @@
for (int j = 0; j < kNumCoeffs; ++j) {
#if CONFIG_AOM_HIGHBITDEPTH
- const uint32_t diff =
+ const int diff =
bit_depth_ == AOM_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
#else
- const uint32_t diff = dst[j] - ref[j];
+ const int diff = dst[j] - ref[j];
#endif // CONFIG_AOM_HIGHBITDEPTH
const uint32_t error = diff * diff;
EXPECT_EQ(0u, error) << "Error: 16x16 IDCT Comparison has error "
@@ -742,6 +741,66 @@
CompareInvReference(ref_txfm_, thresh_);
}
+class PartialTrans16x16Test : public ::testing::TestWithParam<
+ std::tr1::tuple<FdctFunc, aom_bit_depth_t> > {
+ public:
+ virtual ~PartialTrans16x16Test() {}
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ bit_depth_ = GET_PARAM(1);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ aom_bit_depth_t bit_depth_;
+ FdctFunc fwd_txfm_;
+};
+
+TEST_P(PartialTrans16x16Test, Extremes) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ const int16_t maxval =
+ static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
+#else
+ const int16_t maxval = 255;
+#endif
+ const int minval = -maxval;
+ DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
+ DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
+
+ for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval;
+ output[0] = 0;
+ ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
+ EXPECT_EQ((maxval * kNumCoeffs) >> 1, output[0]);
+
+ for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval;
+ output[0] = 0;
+ ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
+ EXPECT_EQ((minval * kNumCoeffs) >> 1, output[0]);
+}
+
+TEST_P(PartialTrans16x16Test, Random) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ const int16_t maxval =
+ static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
+#else
+ const int16_t maxval = 255;
+#endif
+ DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
+ DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ int sum = 0;
+ for (int i = 0; i < kNumCoeffs; ++i) {
+ const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1);
+ input[i] = val;
+ sum += val;
+ }
+ output[0] = 0;
+ ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16));
+ EXPECT_EQ(sum >> 1, output[0]);
+}
+
using std::tr1::make_tuple;
#if CONFIG_AOM_HIGHBITDEPTH
@@ -774,6 +833,11 @@
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 1, AOM_BITS_8),
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 2, AOM_BITS_8),
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 3, AOM_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+ C, PartialTrans16x16Test,
+ ::testing::Values(make_tuple(&aom_highbd_fdct16x16_1_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_fdct16x16_1_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_fdct16x16_1_c, AOM_BITS_12)));
#else
INSTANTIATE_TEST_CASE_P(
C, Trans16x16HT,
@@ -782,6 +846,9 @@
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 1, AOM_BITS_8),
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 2, AOM_BITS_8),
make_tuple(&av1_fht16x16_c, &av1_iht16x16_256_add_c, 3, AOM_BITS_8)));
+INSTANTIATE_TEST_CASE_P(C, PartialTrans16x16Test,
+ ::testing::Values(make_tuple(&aom_fdct16x16_1_c,
+ AOM_BITS_8)));
#endif // CONFIG_AOM_HIGHBITDEPTH
#if HAVE_NEON_ASM && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -806,8 +873,17 @@
2, AOM_BITS_8),
make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_sse2,
3, AOM_BITS_8)));
+INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test,
+ ::testing::Values(make_tuple(&aom_fdct16x16_1_sse2,
+ AOM_BITS_8)));
#endif // HAVE_SSE2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_AVX2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(AVX2, PartialTrans16x16Test,
+ ::testing::Values(make_tuple(&aom_fdct16x16_1_avx2,
+ AOM_BITS_8)));
+#endif // HAVE_AVX2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
#if HAVE_SSE2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16DCT,
@@ -840,6 +916,14 @@
&idct16x16_10_add_12_sse2, 3167, AOM_BITS_12),
make_tuple(&idct16x16_12, &idct16x16_256_add_12_sse2,
3167, AOM_BITS_12)));
+// TODO(luoyi):
+// For this test case, we should test the function aom_highbd_fdct16x16_1_sse2.
+// However, that function is not available yet. If we mistakenly test
+// aom_fdct16x16_1_sse2, it can only pass AOM_BITS_8/AOM_BITS_10 but not
+// AOM_BITS_12.
+INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test,
+ ::testing::Values(make_tuple(&aom_fdct16x16_1_sse2,
+ AOM_BITS_8)));
#endif // HAVE_SSE2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_MSA && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -847,6 +931,7 @@
::testing::Values(make_tuple(&aom_fdct16x16_msa,
&aom_idct16x16_256_add_msa,
0, AOM_BITS_8)));
+#if !CONFIG_EXT_TX
INSTANTIATE_TEST_CASE_P(
MSA, Trans16x16HT,
::testing::Values(
@@ -855,5 +940,9 @@
make_tuple(&av1_fht16x16_msa, &av1_iht16x16_256_add_msa, 2, AOM_BITS_8),
make_tuple(&av1_fht16x16_msa, &av1_iht16x16_256_add_msa, 3,
AOM_BITS_8)));
+#endif // !CONFIG_EXT_TX
+INSTANTIATE_TEST_CASE_P(MSA, PartialTrans16x16Test,
+ ::testing::Values(make_tuple(&aom_fdct16x16_1_msa,
+ AOM_BITS_8)));
#endif // HAVE_MSA && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
} // namespace
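The PartialTrans16x16Test cases added above check only the DC term: a partial forward 16x16 DCT reduces to the sum of the 256 input residuals scaled down by one bit, which is exactly what the EXPECT_EQ(sum >> 1, output[0]) assertions encode (the 32x32 counterpart in the next file uses sum >> 3). A sketch of an equivalent C reference; the shipping aom_fdct16x16_1_c may differ in details such as the output type or extra zeroed coefficients:

#include <stdint.h>

// Reference partial transform: only the DC coefficient is produced.
static void fdct16x16_1_sketch(const int16_t *input, int32_t *output,
                               int stride) {
  int32_t sum = 0;
  for (int r = 0; r < 16; ++r)
    for (int c = 0; c < 16; ++c) sum += input[r * stride + c];
  output[0] = sum >> 1;  // Matches the PartialTrans16x16Test expectation.
}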
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 61805f2..cb2fbd5 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -290,6 +290,67 @@
}
}
+class PartialTrans32x32Test
+ : public ::testing::TestWithParam<
+ std::tr1::tuple<FwdTxfmFunc, aom_bit_depth_t> > {
+ public:
+ virtual ~PartialTrans32x32Test() {}
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ bit_depth_ = GET_PARAM(1);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ aom_bit_depth_t bit_depth_;
+ FwdTxfmFunc fwd_txfm_;
+};
+
+TEST_P(PartialTrans32x32Test, Extremes) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ const int16_t maxval =
+ static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
+#else
+ const int16_t maxval = 255;
+#endif
+ const int minval = -maxval;
+ DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
+ DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
+
+ for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval;
+ output[0] = 0;
+ ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
+ EXPECT_EQ((maxval * kNumCoeffs) >> 3, output[0]);
+
+ for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval;
+ output[0] = 0;
+ ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
+ EXPECT_EQ((minval * kNumCoeffs) >> 3, output[0]);
+}
+
+TEST_P(PartialTrans32x32Test, Random) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ const int16_t maxval =
+ static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_));
+#else
+ const int16_t maxval = 255;
+#endif
+ DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]);
+ DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]);
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ int sum = 0;
+ for (int i = 0; i < kNumCoeffs; ++i) {
+ const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1);
+ input[i] = val;
+ sum += val;
+ }
+ output[0] = 0;
+ ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32));
+ EXPECT_EQ(sum >> 3, output[0]);
+}
+
using std::tr1::make_tuple;
#if CONFIG_AOM_HIGHBITDEPTH
@@ -303,6 +364,11 @@
make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c, 0, AOM_BITS_8),
make_tuple(&aom_fdct32x32_rd_c, &aom_idct32x32_1024_add_c, 1,
AOM_BITS_8)));
+INSTANTIATE_TEST_CASE_P(
+ C, PartialTrans32x32Test,
+ ::testing::Values(make_tuple(&aom_highbd_fdct32x32_1_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_fdct32x32_1_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_fdct32x32_1_c, AOM_BITS_12)));
#else
INSTANTIATE_TEST_CASE_P(
C, Trans32x32Test,
@@ -310,16 +376,19 @@
AOM_BITS_8),
make_tuple(&aom_fdct32x32_rd_c, &aom_idct32x32_1024_add_c,
1, AOM_BITS_8)));
+INSTANTIATE_TEST_CASE_P(C, PartialTrans32x32Test,
+ ::testing::Values(make_tuple(&aom_fdct32x32_1_c,
+ AOM_BITS_8)));
#endif // CONFIG_AOM_HIGHBITDEPTH
-#if HAVE_NEON_ASM && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
NEON, Trans32x32Test,
::testing::Values(make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_neon,
0, AOM_BITS_8),
make_tuple(&aom_fdct32x32_rd_c,
&aom_idct32x32_1024_add_neon, 1, AOM_BITS_8)));
-#endif // HAVE_NEON_ASM && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif // HAVE_NEON && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_SSE2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
@@ -328,8 +397,17 @@
&aom_idct32x32_1024_add_sse2, 0, AOM_BITS_8),
make_tuple(&aom_fdct32x32_rd_sse2,
&aom_idct32x32_1024_add_sse2, 1, AOM_BITS_8)));
+INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test,
+ ::testing::Values(make_tuple(&aom_fdct32x32_1_sse2,
+ AOM_BITS_8)));
#endif // HAVE_SSE2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_AVX2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(AVX2, PartialTrans32x32Test,
+ ::testing::Values(make_tuple(&aom_fdct32x32_1_avx2,
+ AOM_BITS_8)));
+#endif // HAVE_AVX2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
#if HAVE_SSE2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans32x32Test,
@@ -344,6 +422,9 @@
AOM_BITS_8),
make_tuple(&aom_fdct32x32_rd_sse2, &aom_idct32x32_1024_add_c, 1,
AOM_BITS_8)));
+INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test,
+ ::testing::Values(make_tuple(&aom_fdct32x32_1_sse2,
+ AOM_BITS_8)));
#endif // HAVE_SSE2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_AVX2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -355,6 +436,15 @@
&aom_idct32x32_1024_add_sse2, 1, AOM_BITS_8)));
#endif // HAVE_AVX2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_AVX2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_CASE_P(
+ AVX2, Trans32x32Test,
+ ::testing::Values(make_tuple(&aom_fdct32x32_avx2,
+ &aom_idct32x32_1024_add_sse2, 0, AOM_BITS_8),
+ make_tuple(&aom_fdct32x32_rd_avx2,
+ &aom_idct32x32_1024_add_sse2, 1, AOM_BITS_8)));
+#endif // HAVE_AVX2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
#if HAVE_MSA && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
MSA, Trans32x32Test,
@@ -362,5 +452,8 @@
&aom_idct32x32_1024_add_msa, 0, AOM_BITS_8),
make_tuple(&aom_fdct32x32_rd_msa,
&aom_idct32x32_1024_add_msa, 1, AOM_BITS_8)));
+INSTANTIATE_TEST_CASE_P(MSA, PartialTrans32x32Test,
+ ::testing::Values(make_tuple(&aom_fdct32x32_1_msa,
+ AOM_BITS_8)));
#endif // HAVE_MSA && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
} // namespace
diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index bd553c8..7adb9d6 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -18,7 +18,8 @@
namespace libaom_test {
-const char kAOMName[] = "AOMedia Project AOM";
+const char kVP8Name[] = "WebM Project VP8";
+const char kAV1Name[] = "AOMedia Project AV1 Decoder";
aom_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size,
aom_codec_stream_info_t *stream_info) {
@@ -40,17 +41,22 @@
return res_dec;
}
-bool Decoder::IsAOM() const {
+bool Decoder::IsVP8() const {
const char *codec_name = GetDecoderName();
- return strncmp(kAOMName, codec_name, sizeof(kAOMName) - 1) == 0;
+ return strncmp(kVP8Name, codec_name, sizeof(kVP8Name) - 1) == 0;
+}
+
+bool Decoder::IsAV1() const {
+ const char *codec_name = GetDecoderName();
+ return strncmp(kAV1Name, codec_name, sizeof(kAV1Name) - 1) == 0;
}
void DecoderTest::HandlePeekResult(Decoder *const decoder,
CompressedVideoSource *video,
const aom_codec_err_t res_peek) {
- const bool is_aom = decoder->IsAOM();
- if (is_aom) {
- /* AOM's implementation of PeekStream returns an error if the frame you
+ const bool is_vp8 = decoder->IsVP8();
+ if (is_vp8) {
+    /* VP8's implementation of PeekStream returns an error if the frame you
* pass it is not a keyframe, so we only expect AOM_CODEC_OK on the first
* frame, which must be a keyframe. */
if (video->frame_number() == 0)
diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index 57c6e7c..b8f8d1a 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -99,7 +99,9 @@
return aom_codec_iface_name(CodecInterface());
}
- bool IsAOM() const;
+ bool IsVP8() const;
+
+ bool IsAV1() const;
aom_codec_ctx_t *GetDecoder() { return &decoder_; }
diff --git a/test/denoiser_sse2_test.cc b/test/denoiser_sse2_test.cc
index bf0bdcd..d51155d 100644
--- a/test/denoiser_sse2_test.cc
+++ b/test/denoiser_sse2_test.cc
@@ -92,8 +92,7 @@
// Test for all block size.
INSTANTIATE_TEST_CASE_P(SSE2, AV1DenoiserTest,
- ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
- BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
+ ::testing::Values(BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
BLOCK_64X64));
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index deb1283..092e669 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -14,6 +14,7 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./aom_config.h"
+#include "aom_ports/mem.h"
#include "test/codec_factory.h"
#include "test/decode_test_driver.h"
#include "test/encode_test_driver.h"
@@ -36,13 +37,18 @@
#if CONFIG_AV1_ENCODER
if (CodecInterface() == &aom_codec_av1_cx_algo) {
- // Default to 1 tile column for AV1.
+// Default to 1 tile column for AV1. With CONFIG_EXT_TILE, the
+// default is already the largest possible tile size.
+#if !CONFIG_EXT_TILE
const int log2_tile_columns = 0;
res = aom_codec_control_(&encoder_, AV1E_SET_TILE_COLUMNS,
log2_tile_columns);
ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
- }
+#endif // !CONFIG_EXT_TILE
+ } else
#endif
+ {
+ }
}
}
@@ -115,38 +121,95 @@
else
passes_ = 1;
}
-// The function should return "true" most of the time, therefore no early
-// break-out is implemented within the match checking process.
-static bool compare_img(const aom_image_t *img1, const aom_image_t *img2) {
- bool match = (img1->fmt == img2->fmt) && (img1->cs == img2->cs) &&
- (img1->d_w == img2->d_w) && (img1->d_h == img2->d_h);
- const unsigned int width_y = img1->d_w;
- const unsigned int height_y = img1->d_h;
- unsigned int i;
- for (i = 0; i < height_y; ++i)
- match = (memcmp(img1->planes[AOM_PLANE_Y] + i * img1->stride[AOM_PLANE_Y],
- img2->planes[AOM_PLANE_Y] + i * img2->stride[AOM_PLANE_Y],
- width_y) == 0) &&
- match;
- const unsigned int width_uv = (img1->d_w + 1) >> 1;
- const unsigned int height_uv = (img1->d_h + 1) >> 1;
- for (i = 0; i < height_uv; ++i)
- match = (memcmp(img1->planes[AOM_PLANE_U] + i * img1->stride[AOM_PLANE_U],
- img2->planes[AOM_PLANE_U] + i * img2->stride[AOM_PLANE_U],
- width_uv) == 0) &&
- match;
- for (i = 0; i < height_uv; ++i)
- match = (memcmp(img1->planes[AOM_PLANE_V] + i * img1->stride[AOM_PLANE_V],
- img2->planes[AOM_PLANE_V] + i * img2->stride[AOM_PLANE_V],
- width_uv) == 0) &&
- match;
- return match;
+static bool compare_plane(const uint8_t *const buf1, const int stride1,
+ const uint8_t *const buf2, const int stride2,
+ const int w, const int h, int *const mismatch_row,
+ int *const mismatch_col, int *const mismatch_pix1,
+ int *const mismatch_pix2) {
+ int r, c;
+
+ for (r = 0; r < h; ++r) {
+ for (c = 0; c < w; ++c) {
+ const int pix1 = buf1[r * stride1 + c];
+ const int pix2 = buf2[r * stride2 + c];
+
+ if (pix1 != pix2) {
+ if (mismatch_row != NULL) *mismatch_row = r;
+ if (mismatch_col != NULL) *mismatch_col = c;
+ if (mismatch_pix1 != NULL) *mismatch_pix1 = pix1;
+ if (mismatch_pix2 != NULL) *mismatch_pix2 = pix2;
+ return false;
+ }
+ }
+ }
+
+ return true;
}
-void EncoderTest::MismatchHook(const aom_image_t * /*img1*/,
- const aom_image_t * /*img2*/) {
- ASSERT_TRUE(0) << "Encode/Decode mismatch found";
+// Returns true when the two images match exactly. Otherwise it stops at the
+// first mismatching pixel and reports its plane, position and values through
+// the optional out parameters.
+static bool compare_img(const aom_image_t *img1, const aom_image_t *img2,
+ int *const mismatch_row, int *const mismatch_col,
+ int *const mismatch_plane, int *const mismatch_pix1,
+ int *const mismatch_pix2) {
+ const unsigned int w_y = img1->d_w;
+ const unsigned int h_y = img1->d_h;
+ const unsigned int w_uv = ROUND_POWER_OF_TWO(w_y, img1->x_chroma_shift);
+ const unsigned int h_uv = ROUND_POWER_OF_TWO(h_y, img1->y_chroma_shift);
+
+ if (img1->fmt != img2->fmt || img1->cs != img2->cs ||
+ img1->d_w != img2->d_w || img1->d_h != img2->d_h) {
+ if (mismatch_row != NULL) *mismatch_row = -1;
+ if (mismatch_col != NULL) *mismatch_col = -1;
+ return false;
+ }
+
+ if (!compare_plane(img1->planes[AOM_PLANE_Y], img1->stride[AOM_PLANE_Y],
+ img2->planes[AOM_PLANE_Y], img2->stride[AOM_PLANE_Y], w_y,
+ h_y, mismatch_row, mismatch_col, mismatch_pix1,
+ mismatch_pix2)) {
+ if (mismatch_plane != NULL) *mismatch_plane = AOM_PLANE_Y;
+ return false;
+ }
+
+ if (!compare_plane(img1->planes[AOM_PLANE_U], img1->stride[AOM_PLANE_U],
+ img2->planes[AOM_PLANE_U], img2->stride[AOM_PLANE_U], w_uv,
+ h_uv, mismatch_row, mismatch_col, mismatch_pix1,
+ mismatch_pix2)) {
+ if (mismatch_plane != NULL) *mismatch_plane = AOM_PLANE_U;
+ return false;
+ }
+
+ if (!compare_plane(img1->planes[AOM_PLANE_V], img1->stride[AOM_PLANE_V],
+ img2->planes[AOM_PLANE_V], img2->stride[AOM_PLANE_V], w_uv,
+ h_uv, mismatch_row, mismatch_col, mismatch_pix1,
+ mismatch_pix2)) {
+    if (mismatch_plane != NULL) *mismatch_plane = AOM_PLANE_V;
+ return false;
+ }
+
+ return true;
+}
+
+void EncoderTest::MismatchHook(const aom_image_t *img_enc,
+ const aom_image_t *img_dec) {
+ int mismatch_row = 0;
+ int mismatch_col = 0;
+ int mismatch_plane = 0;
+ int mismatch_pix_enc = 0;
+ int mismatch_pix_dec = 0;
+
+ ASSERT_FALSE(compare_img(img_enc, img_dec, &mismatch_row, &mismatch_col,
+ &mismatch_plane, &mismatch_pix_enc,
+ &mismatch_pix_dec));
+
+ GTEST_FAIL() << "Encode/Decode mismatch found:" << std::endl
+ << " pixel value enc/dec: " << mismatch_pix_enc << "/"
+ << mismatch_pix_dec << std::endl
+ << " plane: " << mismatch_plane << std::endl
+ << " row/col: " << mismatch_row << "/"
+ << mismatch_col << std::endl;
}
void EncoderTest::RunLoop(VideoSource *video) {
@@ -166,26 +229,36 @@
cfg_.g_pass = AOM_RC_LAST_PASS;
BeginPassHook(pass);
- Encoder *const encoder =
- codec_->CreateEncoder(cfg_, deadline_, init_flags_, &stats_);
- ASSERT_TRUE(encoder != NULL);
+ testing::internal::scoped_ptr<Encoder> encoder(
+ codec_->CreateEncoder(cfg_, deadline_, init_flags_, &stats_));
+ ASSERT_TRUE(encoder.get() != NULL);
- video->Begin();
+ ASSERT_NO_FATAL_FAILURE(video->Begin());
encoder->InitEncoder(video);
ASSERT_FALSE(::testing::Test::HasFatalFailure());
unsigned long dec_init_flags = 0; // NOLINT
// Use fragment decoder if encoder outputs partitions.
- // NOTE: fragment decoder and partition encoder are only supported by AOM.
+ // NOTE: fragment decoder and partition encoder are only supported by VP8.
if (init_flags_ & AOM_CODEC_USE_OUTPUT_PARTITION)
dec_init_flags |= AOM_CODEC_USE_INPUT_FRAGMENTS;
- Decoder *const decoder = codec_->CreateDecoder(dec_cfg, dec_init_flags, 0);
+ testing::internal::scoped_ptr<Decoder> decoder(
+ codec_->CreateDecoder(dec_cfg, dec_init_flags, 0));
+#if CONFIG_AV1 && CONFIG_EXT_TILE
+ if (decoder->IsAV1()) {
+ // Set dec_cfg.tile_row = -1 and dec_cfg.tile_col = -1 so that the whole
+ // frame is decoded.
+ decoder->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ decoder->Control(AV1_SET_DECODE_TILE_COL, -1);
+ }
+#endif
+
bool again;
for (again = true; again; video->Next()) {
again = (video->img() != NULL);
PreEncodeFrameHook(video);
- PreEncodeFrameHook(video, encoder);
+ PreEncodeFrameHook(video, encoder.get());
encoder->EncodeFrame(video, frame_flags_);
CxDataIterator iter = encoder->GetCxData();
@@ -198,11 +271,11 @@
switch (pkt->kind) {
case AOM_CODEC_CX_FRAME_PKT:
has_cxdata = true;
- if (decoder && DoDecode()) {
+ if (decoder.get() != NULL && DoDecode()) {
aom_codec_err_t res_dec = decoder->DecodeFrame(
(const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz);
- if (!HandleDecodeResult(res_dec, decoder)) break;
+ if (!HandleDecodeResult(res_dec, decoder.get())) break;
has_dxdata = true;
}
@@ -220,7 +293,7 @@
// Flush the decoder when there are no more fragments.
if ((init_flags_ & AOM_CODEC_USE_OUTPUT_PARTITION) && has_dxdata) {
const aom_codec_err_t res_dec = decoder->DecodeFrame(NULL, 0);
- if (!HandleDecodeResult(res_dec, decoder)) break;
+ if (!HandleDecodeResult(res_dec, decoder.get())) break;
}
if (has_dxdata && has_cxdata) {
@@ -228,7 +301,8 @@
DxDataIterator dec_iter = decoder->GetDxData();
const aom_image_t *img_dec = dec_iter.Next();
if (img_enc && img_dec) {
- const bool res = compare_img(img_enc, img_dec);
+ const bool res =
+ compare_img(img_enc, img_dec, NULL, NULL, NULL, NULL, NULL);
if (!res) { // Mismatch
MismatchHook(img_enc, img_dec);
}
@@ -240,9 +314,6 @@
EndPassHook();
- if (decoder) delete decoder;
- delete encoder;
-
if (!Continue()) break;
}
}
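One detail worth calling out in the new compare_img(): the chroma plane dimensions are derived from the image's chroma shifts via ROUND_POWER_OF_TWO rather than the previous hard-coded (w + 1) >> 1, so 4:2:2 and 4:4:4 images are compared over the correct area. A rough illustration, using one safe spelling of the rounding macro; the project's own definition in aom_ports/mem.h may be written differently but yields the same values here:

// Illustrative rounding-shift macro; not the one from aom_ports/mem.h.
#define ROUND_POWER_OF_TWO_SKETCH(value, n) \
  (((value) + ((1 << (n)) >> 1)) >> (n))

// A 13x9 4:2:0 image has x_chroma_shift == y_chroma_shift == 1, so
//   w_uv = ROUND_POWER_OF_TWO_SKETCH(13, 1) == 7
//   h_uv = ROUND_POWER_OF_TWO_SKETCH(9, 1) == 5
// while a 4:4:4 image (shifts of 0) keeps the full luma dimensions.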
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index c7dfd00..45a080e 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -124,11 +124,6 @@
ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
}
- void Control(int ctrl_id, struct aom_svc_layer_id *arg) {
- const aom_codec_err_t res = aom_codec_control_(&encoder_, ctrl_id, arg);
- ASSERT_EQ(AOM_CODEC_OK, res) << EncoderError();
- }
-
#if CONFIG_AV1_ENCODER
void Control(int ctrl_id, aom_active_map_t *arg) {
const aom_codec_err_t res = aom_codec_control_(&encoder_, ctrl_id, arg);
diff --git a/test/end_to_end_test.cc b/test/end_to_end_test.cc
index 15077b8..a88106f 100644
--- a/test/end_to_end_test.cc
+++ b/test/end_to_end_test.cc
@@ -26,10 +26,21 @@
const int kBitrate = 500;
// List of psnr thresholds for speed settings 0-7 and 5 encoding modes
const double kPsnrThreshold[][5] = {
+// Note:
+// The average PSNR of AV1 HBD is slightly lower than that of lowbitdepth
+// AV1. Two sets of thresholds are kept here so the test can run in both
+// configurations while still guarding picture quality.
+#if CONFIG_AV1_ENCODER && CONFIG_AOM_HIGHBITDEPTH
+ { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 31.0, 36.0, 36.0, 36.0, 36.0 },
+ { 31.0, 35.0, 35.0, 35.0, 35.0 }, { 31.0, 34.0, 34.0, 34.0, 34.0 },
+ { 31.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 },
+ { 30.0, 31.0, 31.0, 31.0, 31.0 }, { 29.0, 30.0, 30.0, 30.0, 30.0 },
+#else
{ 36.0, 37.0, 37.0, 37.0, 37.0 }, { 35.0, 36.0, 36.0, 36.0, 36.0 },
{ 34.0, 35.0, 35.0, 35.0, 35.0 }, { 33.0, 34.0, 34.0, 34.0, 34.0 },
{ 32.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 },
{ 30.0, 31.0, 31.0, 31.0, 31.0 }, { 29.0, 30.0, 30.0, 30.0, 30.0 },
+#endif  // CONFIG_AV1_ENCODER && CONFIG_AOM_HIGHBITDEPTH
};
typedef struct {
@@ -177,22 +188,8 @@
delete (video);
}
-#if CONFIG_AOM_HIGHBITDEPTH
-#if CONFIG_AV1_ENCODER
-// TODO(angiebird): many fail in high bitdepth mode.
-INSTANTIATE_TEST_CASE_P(
- DISABLED_AV1, EndToEndTestLarge,
- ::testing::Combine(
- ::testing::Values(
- static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)),
- ::testing::ValuesIn(kEncodingModeVectors),
- ::testing::ValuesIn(kTestVectors),
- ::testing::ValuesIn(kCpuUsedVectors)));
-#endif // CONFIG_AV1_ENCODER
-#else
AV1_INSTANTIATE_TEST_CASE(EndToEndTestLarge,
::testing::ValuesIn(kEncodingModeVectors),
::testing::ValuesIn(kTestVectors),
::testing::ValuesIn(kCpuUsedVectors));
-#endif // CONFIG_AOM_HIGHBITDEPTH
} // namespace
diff --git a/test/error_block_test.cc b/test/error_block_test.cc
index db1391b..3de35c7 100644
--- a/test/error_block_test.cc
+++ b/test/error_block_test.cc
@@ -156,25 +156,9 @@
<< "First failed at test case " << first_failure;
}
+#if HAVE_SSE2 || HAVE_AVX
using std::tr1::make_tuple;
-int64_t wrap_av1_highbd_block_error_8bit_c(const tran_low_t *coeff,
- const tran_low_t *dqcoeff,
- intptr_t block_size, int64_t *ssz,
- int bps) {
- assert(bps == 8);
- return av1_highbd_block_error_8bit_c(coeff, dqcoeff, block_size, ssz);
-}
-
-#if HAVE_SSE2
-int64_t wrap_av1_highbd_block_error_8bit_sse2(const tran_low_t *coeff,
- const tran_low_t *dqcoeff,
- intptr_t block_size, int64_t *ssz,
- int bps) {
- assert(bps == 8);
- return av1_highbd_block_error_8bit_sse2(coeff, dqcoeff, block_size, ssz);
-}
-
INSTANTIATE_TEST_CASE_P(
SSE2, ErrorBlockTest,
::testing::Values(make_tuple(&av1_highbd_block_error_sse2,
@@ -182,25 +166,8 @@
make_tuple(&av1_highbd_block_error_sse2,
&av1_highbd_block_error_c, AOM_BITS_12),
make_tuple(&av1_highbd_block_error_sse2,
- &av1_highbd_block_error_c, AOM_BITS_8),
- make_tuple(&wrap_av1_highbd_block_error_8bit_sse2,
- &wrap_av1_highbd_block_error_8bit_c,
- AOM_BITS_8)));
+ &av1_highbd_block_error_c, AOM_BITS_8)));
#endif // HAVE_SSE2
-#if HAVE_AVX
-int64_t wrap_av1_highbd_block_error_8bit_avx(const tran_low_t *coeff,
- const tran_low_t *dqcoeff,
- intptr_t block_size, int64_t *ssz,
- int bps) {
- assert(bps == 8);
- return av1_highbd_block_error_8bit_avx(coeff, dqcoeff, block_size, ssz);
-}
-
-INSTANTIATE_TEST_CASE_P(AVX, ErrorBlockTest,
- ::testing::Values(make_tuple(
- &wrap_av1_highbd_block_error_8bit_avx,
- &wrap_av1_highbd_block_error_8bit_c, AOM_BITS_8)));
-#endif // HAVE_AVX
#endif // CONFIG_AOM_HIGHBITDEPTH
} // namespace
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index a487c52..07b6039 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -22,7 +22,7 @@
class ErrorResilienceTestLarge
: public ::libaom_test::EncoderTest,
- public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, bool> {
+ public ::libaom_test::CodecTestWithParam<libaom_test::TestMode> {
protected:
ErrorResilienceTestLarge()
: EncoderTest(GET_PARAM(0)), psnr_(0.0), nframes_(0), mismatch_psnr_(0.0),
@@ -58,7 +58,6 @@
virtual void PreEncodeFrameHook(libaom_test::VideoSource *video) {
frame_flags_ &=
~(AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF);
- // For temporal layer case.
if (droppable_nframes_ > 0 &&
(cfg_.g_pass == AOM_RC_LAST_PASS || cfg_.g_pass == AOM_RC_ONE_PASS)) {
for (unsigned int i = 0; i < droppable_nframes_; ++i) {
@@ -102,6 +101,7 @@
mismatch_psnr_ += mismatch_psnr;
++mismatch_nframes_;
// std::cout << "Mismatch frame psnr: " << mismatch_psnr << "\n";
+ ::libaom_test::EncoderTest::MismatchHook(img1, img2);
}
void SetErrorFrames(int num, unsigned int *list) {
@@ -231,57 +231,5 @@
#endif
}
-class ErrorResilienceTestLargeCodecControls
- : public ::libaom_test::EncoderTest,
- public ::libaom_test::CodecTestWithParam<libaom_test::TestMode> {
- protected:
- ErrorResilienceTestLargeCodecControls()
- : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)) {
- Reset();
- }
-
- virtual ~ErrorResilienceTestLargeCodecControls() {}
-
- void Reset() {
- last_pts_ = 0;
- tot_frame_number_ = 0;
- bits_total_ = 0;
- duration_ = 0.0;
- }
-
- virtual void SetUp() {
- InitializeConfig();
- SetMode(encoding_mode_);
- }
-
- virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
- // Time since last timestamp = duration.
- aom_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
- if (duration > 1) {
- // Update counter for total number of frames (#frames input to encoder).
- // Needed for setting the proper layer_id below.
- tot_frame_number_ += static_cast<int>(duration - 1);
- }
- const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
- // Update the total encoded bits. For temporal layers, update the cumulative
- // encoded bits per layer.
- bits_total_ += frame_size_in_bits;
- // Update the most recent pts.
- last_pts_ = pkt->data.frame.pts;
- ++tot_frame_number_;
- }
-
- virtual void EndPassHook(void) { duration_ = (last_pts_ + 1) * timebase_; }
-
- private:
- libaom_test::TestMode encoding_mode_;
- aom_codec_pts_t last_pts_;
- double timebase_;
- int64_t bits_total_;
- double duration_;
- int tot_frame_number_;
-};
-
-AV1_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES,
- ::testing::Values(false));
+AV1_INSTANTIATE_TEST_CASE(ErrorResilienceTestLarge, ONE_PASS_TEST_MODES);
} // namespace
diff --git a/test/ethread_test.cc b/test/ethread_test.cc
index cc0a9d4..2685f87 100644
--- a/test/ethread_test.cc
+++ b/test/ethread_test.cc
@@ -24,13 +24,25 @@
public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int> {
protected:
AVxEncoderThreadTest()
- : EncoderTest(GET_PARAM(0)), encoder_initialized_(false), tiles_(2),
+ : EncoderTest(GET_PARAM(0)), encoder_initialized_(false),
encoding_mode_(GET_PARAM(1)), set_cpu_used_(GET_PARAM(2)) {
init_flags_ = AOM_CODEC_USE_PSNR;
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ cfg.w = 1280;
+ cfg.h = 720;
+ decoder_ = codec_->CreateDecoder(cfg, 0);
+#if CONFIG_AV1 && CONFIG_EXT_TILE
+ if (decoder_->IsAV1()) {
+ decoder_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ decoder_->Control(AV1_SET_DECODE_TILE_COL, -1);
+ }
+#endif
- md5_.clear();
+ size_enc_.clear();
+ md5_dec_.clear();
+ md5_enc_.clear();
}
- virtual ~AVxEncoderThreadTest() {}
+ virtual ~AVxEncoderThreadTest() { delete decoder_; }
virtual void SetUp() {
InitializeConfig();
@@ -40,7 +52,7 @@
cfg_.g_lag_in_frames = 3;
cfg_.rc_end_usage = AOM_VBR;
cfg_.rc_2pass_vbr_minsection_pct = 5;
- cfg_.rc_2pass_vbr_minsection_pct = 2000;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
} else {
cfg_.g_lag_in_frames = 0;
cfg_.rc_end_usage = AOM_CBR;
@@ -57,14 +69,27 @@
virtual void PreEncodeFrameHook(::libaom_test::VideoSource * /*video*/,
::libaom_test::Encoder *encoder) {
if (!encoder_initialized_) {
- // Encode 4 column tiles.
- encoder->Control(AV1E_SET_TILE_COLUMNS, tiles_);
+#if CONFIG_AV1 && CONFIG_EXT_TILE
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 1);
+ if (codec_ == &libaom_test::kAV1) {
+ // TODO(geza): Start using multiple tile rows when the multi-threaded
+ // encoder can handle them
+ encoder->Control(AV1E_SET_TILE_ROWS, 32);
+ } else {
+ encoder->Control(AV1E_SET_TILE_ROWS, 0);
+ }
+#else
+ // Encode 4 tile columns.
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 2);
+ encoder->Control(AV1E_SET_TILE_ROWS, 0);
+#endif // CONFIG_AV1 && CONFIG_EXT_TILE
encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
if (encoding_mode_ != ::libaom_test::kRealTime) {
encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
encoder->Control(AOME_SET_ARNR_TYPE, 3);
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 0);
} else {
encoder->Control(AOME_SET_ENABLEAUTOALTREF, 0);
encoder->Control(AV1E_SET_AQ_MODE, 3);
@@ -73,66 +98,93 @@
}
}
- virtual void DecompressedFrameHook(const aom_image_t &img,
- aom_codec_pts_t /*pts*/) {
- ::libaom_test::MD5 md5_res;
- md5_res.Add(&img);
- md5_.push_back(md5_res.Get());
+ virtual void FramePktHook(const aom_codec_cx_pkt_t *pkt) {
+ size_enc_.push_back(pkt->data.frame.sz);
+
+ ::libaom_test::MD5 md5_enc;
+ md5_enc.Add(reinterpret_cast<uint8_t *>(pkt->data.frame.buf),
+ pkt->data.frame.sz);
+ md5_enc_.push_back(md5_enc.Get());
+
+ const aom_codec_err_t res = decoder_->DecodeFrame(
+ reinterpret_cast<uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz);
+ if (res != AOM_CODEC_OK) {
+ abort_ = true;
+ ASSERT_EQ(AOM_CODEC_OK, res);
+ }
+ const aom_image_t *img = decoder_->GetDxData().Next();
+
+ if (img) {
+ ::libaom_test::MD5 md5_res;
+ md5_res.Add(img);
+ md5_dec_.push_back(md5_res.Get());
+ }
}
- virtual bool HandleDecodeResult(const aom_codec_err_t res,
- libaom_test::Decoder * /*decoder*/) {
- if (res != AOM_CODEC_OK) {
- EXPECT_EQ(AOM_CODEC_OK, res);
- return false;
- }
- return true;
+ void DoTest() {
+ ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 15, 18);
+ cfg_.rc_target_bitrate = 1000;
+
+ // Encode using single thread.
+ cfg_.g_threads = 1;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ std::vector<size_t> single_thr_size_enc;
+ std::vector<std::string> single_thr_md5_enc;
+ std::vector<std::string> single_thr_md5_dec;
+ single_thr_size_enc = size_enc_;
+ single_thr_md5_enc = md5_enc_;
+ single_thr_md5_dec = md5_dec_;
+ size_enc_.clear();
+ md5_enc_.clear();
+ md5_dec_.clear();
+
+ // Encode using multiple threads.
+ cfg_.g_threads = 4;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ std::vector<size_t> multi_thr_size_enc;
+ std::vector<std::string> multi_thr_md5_enc;
+ std::vector<std::string> multi_thr_md5_dec;
+ multi_thr_size_enc = size_enc_;
+ multi_thr_md5_enc = md5_enc_;
+ multi_thr_md5_dec = md5_dec_;
+ size_enc_.clear();
+ md5_enc_.clear();
+ md5_dec_.clear();
+
+ // Check that the vectors are equal.
+ ASSERT_EQ(single_thr_size_enc, multi_thr_size_enc);
+ ASSERT_EQ(single_thr_md5_enc, multi_thr_md5_enc);
+ ASSERT_EQ(single_thr_md5_dec, multi_thr_md5_dec);
}
bool encoder_initialized_;
- int tiles_;
::libaom_test::TestMode encoding_mode_;
int set_cpu_used_;
- std::vector<std::string> md5_;
+ ::libaom_test::Decoder *decoder_;
+ std::vector<size_t> size_enc_;
+ std::vector<std::string> md5_enc_;
+ std::vector<std::string> md5_dec_;
};
-TEST_P(AVxEncoderThreadTest, EncoderResultTest) {
- std::vector<std::string> single_thr_md5, multi_thr_md5;
+TEST_P(AVxEncoderThreadTest, EncoderResultTest) { DoTest(); }
- ::libaom_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 15, 20);
+class AVxEncoderThreadTestLarge : public AVxEncoderThreadTest {};
- cfg_.rc_target_bitrate = 1000;
+TEST_P(AVxEncoderThreadTestLarge, EncoderResultTest) { DoTest(); }
- // Encode using single thread.
- cfg_.g_threads = 1;
- init_flags_ = AOM_CODEC_USE_PSNR;
- ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- single_thr_md5 = md5_;
- md5_.clear();
-
- // Encode using multiple threads.
- cfg_.g_threads = 4;
- ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- multi_thr_md5 = md5_;
- md5_.clear();
-
- // Compare to check if two vectors are equal.
- ASSERT_EQ(single_thr_md5, multi_thr_md5);
-}
#if CONFIG_EC_ADAPT
// TODO(thdavies): EC_ADAPT does not support tiles
-INSTANTIATE_TEST_CASE_P(
- DISABLED_AV1, AVxEncoderThreadTest,
- ::testing::Combine(
- ::testing::Values(
- static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)),
- ::testing::Values(::libaom_test::kTwoPassGood,
- ::libaom_test::kOnePassGood),
- ::testing::Range(1, 3)));
+
#else
AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTest,
::testing::Values(::libaom_test::kTwoPassGood,
::libaom_test::kOnePassGood),
- ::testing::Range(1, 3));
+ ::testing::Range(3, 9));
+
+AV1_INSTANTIATE_TEST_CASE(AVxEncoderThreadTestLarge,
+ ::testing::Values(::libaom_test::kTwoPassGood,
+ ::libaom_test::kOnePassGood),
+ ::testing::Range(0, 3));
#endif
} // namespace
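The new DoTest() above establishes determinism by hashing every compressed packet (and every decoded frame) from a single-threaded run and a four-threaded run and requiring the sequences to match. A minimal self-contained sketch of that comparison, with FNV-1a standing in for the harness's MD5 and a hypothetical EncodeRun() supplying the packets:

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

// FNV-1a stands in for the MD5 hash used by the real harness.
std::string HashBuffer(const uint8_t *buf, size_t len) {
  uint64_t h = 1469598103934665603ULL;  // FNV offset basis
  for (size_t i = 0; i < len; ++i) {
    h ^= buf[i];
    h *= 1099511628211ULL;  // FNV prime
  }
  return std::to_string(h);
}

// One hash per compressed packet of a run.
std::vector<std::string> HashRun(const std::vector<std::vector<uint8_t> > &pkts) {
  std::vector<std::string> hashes;
  for (size_t i = 0; i < pkts.size(); ++i)
    hashes.push_back(HashBuffer(pkts[i].data(), pkts[i].size()));
  return hashes;
}

// Usage, with a hypothetical EncodeRun(threads) returning the packets:
//   assert(HashRun(EncodeRun(1)) == HashRun(EncodeRun(4)));
int main() {
  std::vector<std::vector<uint8_t> > run(2, std::vector<uint8_t>(100, 42));
  return HashRun(run) == HashRun(run) ? 0 : 1;
}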
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 66fc635..f7b0932 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -20,6 +20,7 @@
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
#include "test/util.h"
#include "av1/common/entropy.h"
#include "aom/aom_codec.h"
@@ -29,16 +30,15 @@
using libaom_test::ACMRandom;
namespace {
-const int kNumCoeffs = 16;
typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
- int tx_type);
typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
int tx_type);
+using libaom_test::FhtFunc;
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int, aom_bit_depth_t> Dct4x4Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t> Ht4x4Param;
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, aom_bit_depth_t, int>
+ Dct4x4Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht4x4Param;
void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
int /*tx_type*/) {
@@ -90,191 +90,7 @@
#endif // HAVE_SSE2
#endif // CONFIG_AOM_HIGHBITDEPTH
-class Trans4x4TestBase {
- public:
- virtual ~Trans4x4TestBase() {}
-
- protected:
- virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0;
-
- virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0;
-
- void RunAccuracyCheck(int limit) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- uint32_t max_error = 0;
- int64_t total_error = 0;
- const int count_test_block = 10000;
- for (int i = 0; i < count_test_block; ++i) {
- DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
- DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
- DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
-#if CONFIG_AOM_HIGHBITDEPTH
- DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
- DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
-#endif
-
- // Initialize a test block with input range [-255, 255].
- for (int j = 0; j < kNumCoeffs; ++j) {
- if (bit_depth_ == AOM_BITS_8) {
- src[j] = rnd.Rand8();
- dst[j] = rnd.Rand8();
- test_input_block[j] = src[j] - dst[j];
-#if CONFIG_AOM_HIGHBITDEPTH
- } else {
- src16[j] = rnd.Rand16() & mask_;
- dst16[j] = rnd.Rand16() & mask_;
- test_input_block[j] = src16[j] - dst16[j];
-#endif
- }
- }
-
- ASM_REGISTER_STATE_CHECK(
- RunFwdTxfm(test_input_block, test_temp_block, pitch_));
- if (bit_depth_ == AOM_BITS_8) {
- ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
-#if CONFIG_AOM_HIGHBITDEPTH
- } else {
- ASM_REGISTER_STATE_CHECK(
- RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
-#endif
- }
-
- for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_AOM_HIGHBITDEPTH
- const int diff =
- bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
- ASSERT_EQ(AOM_BITS_8, bit_depth_);
- const int diff = dst[j] - src[j];
-#endif
- const uint32_t error = diff * diff;
- if (max_error < error) max_error = error;
- total_error += error;
- }
- }
-
- EXPECT_GE(static_cast<uint32_t>(limit), max_error)
- << "Error: 4x4 FHT/IHT has an individual round trip error > " << limit;
-
- EXPECT_GE(count_test_block * limit, total_error)
- << "Error: 4x4 FHT/IHT has average round trip error > " << limit
- << " per block";
- }
-
- void RunCoeffCheck() {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- const int count_test_block = 5000;
- DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
- for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-mask_, mask_].
- for (int j = 0; j < kNumCoeffs; ++j)
- input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
-
- fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
- ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
-
- // The minimum quant value is 4.
- for (int j = 0; j < kNumCoeffs; ++j)
- EXPECT_EQ(output_block[j], output_ref_block[j]);
- }
- }
-
- void RunMemCheck() {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- const int count_test_block = 5000;
- DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
- for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-mask_, mask_].
- for (int j = 0; j < kNumCoeffs; ++j) {
- input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
- }
- if (i == 0) {
- for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_;
- } else if (i == 1) {
- for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_;
- }
-
- fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
- ASM_REGISTER_STATE_CHECK(
- RunFwdTxfm(input_extreme_block, output_block, pitch_));
-
- // The minimum quant value is 4.
- for (int j = 0; j < kNumCoeffs; ++j) {
- EXPECT_EQ(output_block[j], output_ref_block[j]);
- EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
- << "Error: 4x4 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
- }
- }
- }
-
- void RunInvAccuracyCheck(int limit) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- const int count_test_block = 1000;
- DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
- DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
- DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
-#if CONFIG_AOM_HIGHBITDEPTH
- DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
- DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
-#endif
-
- for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-mask_, mask_].
- for (int j = 0; j < kNumCoeffs; ++j) {
- if (bit_depth_ == AOM_BITS_8) {
- src[j] = rnd.Rand8();
- dst[j] = rnd.Rand8();
- in[j] = src[j] - dst[j];
-#if CONFIG_AOM_HIGHBITDEPTH
- } else {
- src16[j] = rnd.Rand16() & mask_;
- dst16[j] = rnd.Rand16() & mask_;
- in[j] = src16[j] - dst16[j];
-#endif
- }
- }
-
- fwd_txfm_ref(in, coeff, pitch_, tx_type_);
-
- if (bit_depth_ == AOM_BITS_8) {
- ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
-#if CONFIG_AOM_HIGHBITDEPTH
- } else {
- ASM_REGISTER_STATE_CHECK(
- RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
-#endif
- }
-
- for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_AOM_HIGHBITDEPTH
- const int diff =
- bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
- const int diff = dst[j] - src[j];
-#endif
- const uint32_t error = diff * diff;
- EXPECT_GE(static_cast<uint32_t>(limit), error)
- << "Error: 4x4 IDCT has error " << error << " at index " << j;
- }
- }
- }
-
- int pitch_;
- int tx_type_;
- FhtFunc fwd_txfm_ref;
- aom_bit_depth_t bit_depth_;
- int mask_;
-};
-
-class Trans4x4DCT : public Trans4x4TestBase,
+class Trans4x4DCT : public libaom_test::TransformTestBase,
public ::testing::TestWithParam<Dct4x4Param> {
public:
virtual ~Trans4x4DCT() {}
@@ -284,9 +100,11 @@
inv_txfm_ = GET_PARAM(1);
tx_type_ = GET_PARAM(2);
pitch_ = 4;
+ height_ = 4;
fwd_txfm_ref = fdct4x4_ref;
bit_depth_ = GET_PARAM(3);
mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = GET_PARAM(4);
}
virtual void TearDown() { libaom_test::ClearSystemState(); }
@@ -310,7 +128,7 @@
TEST_P(Trans4x4DCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
-class Trans4x4HT : public Trans4x4TestBase,
+class Trans4x4HT : public libaom_test::TransformTestBase,
public ::testing::TestWithParam<Ht4x4Param> {
public:
virtual ~Trans4x4HT() {}
@@ -320,9 +138,11 @@
inv_txfm_ = GET_PARAM(1);
tx_type_ = GET_PARAM(2);
pitch_ = 4;
+ height_ = 4;
fwd_txfm_ref = fht4x4_ref;
bit_depth_ = GET_PARAM(3);
mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = GET_PARAM(4);
}
virtual void TearDown() { libaom_test::ClearSystemState(); }
@@ -347,7 +167,7 @@
TEST_P(Trans4x4HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
-class Trans4x4WHT : public Trans4x4TestBase,
+class Trans4x4WHT : public libaom_test::TransformTestBase,
public ::testing::TestWithParam<Dct4x4Param> {
public:
virtual ~Trans4x4WHT() {}
@@ -357,9 +177,11 @@
inv_txfm_ = GET_PARAM(1);
tx_type_ = GET_PARAM(2);
pitch_ = 4;
+ height_ = 4;
fwd_txfm_ref = fwht4x4_ref;
bit_depth_ = GET_PARAM(3);
mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = GET_PARAM(4);
}
virtual void TearDown() { libaom_test::ClearSystemState(); }
@@ -388,125 +210,135 @@
INSTANTIATE_TEST_CASE_P(
C, Trans4x4DCT,
::testing::Values(
- make_tuple(&aom_highbd_fdct4x4_c, &idct4x4_10, 0, AOM_BITS_10),
- make_tuple(&aom_highbd_fdct4x4_c, &idct4x4_12, 0, AOM_BITS_12),
- make_tuple(&aom_fdct4x4_c, &aom_idct4x4_16_add_c, 0, AOM_BITS_8)));
+ make_tuple(&aom_highbd_fdct4x4_c, &idct4x4_10, 0, AOM_BITS_10, 16),
+ make_tuple(&aom_highbd_fdct4x4_c, &idct4x4_12, 0, AOM_BITS_12, 16),
+ make_tuple(&aom_fdct4x4_c, &aom_idct4x4_16_add_c, 0, AOM_BITS_8, 16)));
#else
INSTANTIATE_TEST_CASE_P(C, Trans4x4DCT,
::testing::Values(make_tuple(&aom_fdct4x4_c,
&aom_idct4x4_16_add_c, 0,
- AOM_BITS_8)));
+ AOM_BITS_8, 16)));
#endif // CONFIG_AOM_HIGHBITDEPTH
#if CONFIG_AOM_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
C, Trans4x4HT,
::testing::Values(
- make_tuple(&av1_highbd_fht4x4_c, &iht4x4_10, 0, AOM_BITS_10),
- make_tuple(&av1_highbd_fht4x4_c, &iht4x4_10, 1, AOM_BITS_10),
- make_tuple(&av1_highbd_fht4x4_c, &iht4x4_10, 2, AOM_BITS_10),
- make_tuple(&av1_highbd_fht4x4_c, &iht4x4_10, 3, AOM_BITS_10),
- make_tuple(&av1_highbd_fht4x4_c, &iht4x4_12, 0, AOM_BITS_12),
- make_tuple(&av1_highbd_fht4x4_c, &iht4x4_12, 1, AOM_BITS_12),
- make_tuple(&av1_highbd_fht4x4_c, &iht4x4_12, 2, AOM_BITS_12),
- make_tuple(&av1_highbd_fht4x4_c, &iht4x4_12, 3, AOM_BITS_12),
- make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 0, AOM_BITS_8),
- make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 1, AOM_BITS_8),
- make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 2, AOM_BITS_8),
- make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 3, AOM_BITS_8)));
+ make_tuple(&av1_highbd_fht4x4_c, &iht4x4_10, 0, AOM_BITS_10, 16),
+ make_tuple(&av1_highbd_fht4x4_c, &iht4x4_10, 1, AOM_BITS_10, 16),
+ make_tuple(&av1_highbd_fht4x4_c, &iht4x4_10, 2, AOM_BITS_10, 16),
+ make_tuple(&av1_highbd_fht4x4_c, &iht4x4_10, 3, AOM_BITS_10, 16),
+ make_tuple(&av1_highbd_fht4x4_c, &iht4x4_12, 0, AOM_BITS_12, 16),
+ make_tuple(&av1_highbd_fht4x4_c, &iht4x4_12, 1, AOM_BITS_12, 16),
+ make_tuple(&av1_highbd_fht4x4_c, &iht4x4_12, 2, AOM_BITS_12, 16),
+ make_tuple(&av1_highbd_fht4x4_c, &iht4x4_12, 3, AOM_BITS_12, 16),
+ make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 0, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 1, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 2, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 3, AOM_BITS_8, 16)));
#else
INSTANTIATE_TEST_CASE_P(
C, Trans4x4HT,
::testing::Values(
- make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 0, AOM_BITS_8),
- make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 1, AOM_BITS_8),
- make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 2, AOM_BITS_8),
- make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 3, AOM_BITS_8)));
+ make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 0, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 1, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 2, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_c, 3, AOM_BITS_8, 16)));
#endif // CONFIG_AOM_HIGHBITDEPTH
#if CONFIG_AOM_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
C, Trans4x4WHT,
::testing::Values(
- make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_10, 0, AOM_BITS_10),
- make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_12, 0, AOM_BITS_12),
- make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_c, 0, AOM_BITS_8)));
+ make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_10, 0, AOM_BITS_10, 16),
+ make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_12, 0, AOM_BITS_12, 16),
+ make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_c, 0, AOM_BITS_8, 16)));
#else
INSTANTIATE_TEST_CASE_P(C, Trans4x4WHT,
::testing::Values(make_tuple(&av1_fwht4x4_c,
&aom_iwht4x4_16_add_c, 0,
- AOM_BITS_8)));
+ AOM_BITS_8, 16)));
#endif // CONFIG_AOM_HIGHBITDEPTH
#if HAVE_NEON_ASM && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(NEON, Trans4x4DCT,
::testing::Values(make_tuple(&aom_fdct4x4_c,
&aom_idct4x4_16_add_neon,
- 0, AOM_BITS_8)));
+ 0, AOM_BITS_8, 16)));
#endif // HAVE_NEON_ASM && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_NEON && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
NEON, Trans4x4HT,
::testing::Values(
- make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon, 0, AOM_BITS_8),
- make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon, 1, AOM_BITS_8),
- make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon, 2, AOM_BITS_8),
- make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon, 3, AOM_BITS_8)));
+ make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon, 0, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon, 1, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon, 2, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_c, &av1_iht4x4_16_add_neon, 3, AOM_BITS_8, 16)));
#endif // HAVE_NEON && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4WHT,
- ::testing::Values(
- make_tuple(&av1_fwht4x4_sse2, &aom_iwht4x4_16_add_c, 0, AOM_BITS_8),
- make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_sse2, 0, AOM_BITS_8)));
+ ::testing::Values(make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_c, 0,
+ AOM_BITS_8, 16),
+ make_tuple(&av1_fwht4x4_c, &aom_iwht4x4_16_add_sse2, 0,
+ AOM_BITS_8, 16)));
#endif
#if HAVE_SSE2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(SSE2, Trans4x4DCT,
::testing::Values(make_tuple(&aom_fdct4x4_sse2,
&aom_idct4x4_16_add_sse2,
- 0, AOM_BITS_8)));
+ 0, AOM_BITS_8, 16)));
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4HT,
- ::testing::Values(
- make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 0, AOM_BITS_8),
- make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 1, AOM_BITS_8),
- make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 2, AOM_BITS_8),
- make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 3, AOM_BITS_8)));
+ ::testing::Values(make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 0,
+ AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 1,
+ AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 2,
+ AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_sse2, 3,
+ AOM_BITS_8, 16)));
#endif // HAVE_SSE2 && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_SSE2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4DCT,
::testing::Values(
- make_tuple(&aom_highbd_fdct4x4_c, &idct4x4_10_sse2, 0, AOM_BITS_10),
- make_tuple(&aom_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, AOM_BITS_10),
- make_tuple(&aom_highbd_fdct4x4_c, &idct4x4_12_sse2, 0, AOM_BITS_12),
- make_tuple(&aom_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, AOM_BITS_12),
- make_tuple(&aom_fdct4x4_sse2, &aom_idct4x4_16_add_c, 0, AOM_BITS_8)));
+ make_tuple(&aom_highbd_fdct4x4_c, &idct4x4_10_sse2, 0, AOM_BITS_10, 16),
+ make_tuple(&aom_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, AOM_BITS_10,
+ 16),
+ make_tuple(&aom_highbd_fdct4x4_c, &idct4x4_12_sse2, 0, AOM_BITS_12, 16),
+ make_tuple(&aom_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, AOM_BITS_12,
+ 16),
+ make_tuple(&aom_fdct4x4_sse2, &aom_idct4x4_16_add_c, 0, AOM_BITS_8,
+ 16)));
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4HT,
::testing::Values(
- make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c, 0, AOM_BITS_8),
- make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c, 1, AOM_BITS_8),
- make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c, 2, AOM_BITS_8),
- make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c, 3, AOM_BITS_8)));
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c, 0, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c, 1, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c, 2, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_sse2, &av1_iht4x4_16_add_c, 3, AOM_BITS_8, 16)));
#endif // HAVE_SSE2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_MSA && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(MSA, Trans4x4DCT,
::testing::Values(make_tuple(&aom_fdct4x4_msa,
&aom_idct4x4_16_add_msa, 0,
- AOM_BITS_8)));
+ AOM_BITS_8, 16)));
+#if !CONFIG_EXT_TX
INSTANTIATE_TEST_CASE_P(
MSA, Trans4x4HT,
::testing::Values(
- make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa, 0, AOM_BITS_8),
- make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa, 1, AOM_BITS_8),
- make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa, 2, AOM_BITS_8),
- make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa, 3, AOM_BITS_8)));
+ make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa, 0, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa, 1, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa, 2, AOM_BITS_8, 16),
+ make_tuple(&av1_fht4x4_msa, &av1_iht4x4_16_add_msa, 3, AOM_BITS_8,
+ 16)));
+#endif // !CONFIG_EXT_TX
#endif // HAVE_MSA && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
} // namespace
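The Trans4x4TestBase removed above is replaced by the shared libaom_test::TransformTestBase, which keeps the same round-trip accounting: forward transform the residual, inverse transform it back onto the prediction, and bound both the worst single-sample error and the average error per block. A compilable sketch of that accounting with a perfect stand-in for the transform pair (the real tests plug in pairs such as aom_fdct4x4 / aom_idct4x4_16_add):

#include <cstdint>
#include <cstdlib>
#include <iostream>

int main() {
  const int kNumCoeffs = 16;  // a 4x4 block
  int16_t residual[kNumCoeffs];
  int16_t reconstructed[kNumCoeffs];
  // Hypothetical round trip: the real test runs fdct followed by idct here.
  for (int j = 0; j < kNumCoeffs; ++j) {
    residual[j] = static_cast<int16_t>((rand() % 511) - 255);  // [-255, 255]
    reconstructed[j] = residual[j];  // a perfect transform pair => zero error
  }
  uint32_t max_error = 0;
  int64_t total_error = 0;
  for (int j = 0; j < kNumCoeffs; ++j) {
    const int diff = reconstructed[j] - residual[j];
    const uint32_t error = static_cast<uint32_t>(diff * diff);
    if (error > max_error) max_error = error;
    total_error += error;
  }
  // The 4x4 tests use limit == 1: no sample may be off by more than 1 after
  // the round trip, and the average error per block must also stay within 1.
  std::cout << "max " << max_error << " total " << total_error << "\n";
  return 0;
}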
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index b58f9b2..bbfb7f1 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -728,8 +728,7 @@
make_tuple(&idct8x8_12, &idct8x8_64_add_12_sse2, 6225, AOM_BITS_12)));
#endif // HAVE_SSE2 && CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_AOM_HIGHBITDEPTH && \
- !CONFIG_EMULATE_HARDWARE
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT,
::testing::Values(make_tuple(&aom_fdct8x8_ssse3,
&aom_idct8x8_64_add_ssse3,
@@ -741,6 +740,7 @@
::testing::Values(make_tuple(&aom_fdct8x8_msa,
&aom_idct8x8_64_add_msa, 0,
AOM_BITS_8)));
+#if !CONFIG_EXT_TX
INSTANTIATE_TEST_CASE_P(
MSA, FwdTrans8x8HT,
::testing::Values(
@@ -748,5 +748,6 @@
make_tuple(&av1_fht8x8_msa, &av1_iht8x8_64_add_msa, 1, AOM_BITS_8),
make_tuple(&av1_fht8x8_msa, &av1_iht8x8_64_add_msa, 2, AOM_BITS_8),
make_tuple(&av1_fht8x8_msa, &av1_iht8x8_64_add_msa, 3, AOM_BITS_8)));
+#endif // !CONFIG_EXT_TX
#endif // HAVE_MSA && !CONFIG_AOM_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
} // namespace
diff --git a/test/fht32x32_test.cc b/test/fht32x32_test.cc
new file mode 100644
index 0000000..8545b2c
--- /dev/null
+++ b/test/fht32x32_test.cc
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./av1_rtcd.h"
+#include "./aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "aom_ports/mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type);
+using std::tr1::tuple;
+using libaom_test::FhtFunc;
+typedef tuple<FhtFunc, IhtFunc, int, aom_bit_depth_t, int> Ht32x32Param;
+
+void fht32x32_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
+ av1_fht32x32_c(in, out, stride, tx_type);
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type, int bd);
+typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
+ int tx_type, int bd);
+
+// Target optimized function, tx_type, bit depth
+typedef tuple<HbdHtFunc, int, int> HighbdHt32x32Param;
+
+void highbd_fht32x32_ref(const int16_t *in, int32_t *out, int stride,
+ int tx_type, int bd) {
+ av1_fwd_txfm2d_32x32_c(in, out, stride, tx_type, bd);
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+#if HAVE_AVX2
+void dummy_inv_txfm(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type) {
+ (void)in;
+ (void)out;
+ (void)stride;
+ (void)tx_type;
+}
+#endif
+
+class AV1Trans32x32HT : public libaom_test::TransformTestBase,
+ public ::testing::TestWithParam<Ht32x32Param> {
+ public:
+ virtual ~AV1Trans32x32HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ tx_type_ = GET_PARAM(2);
+ pitch_ = 32;
+ height_ = 32;
+ fwd_txfm_ref = fht32x32_ref;
+ bit_depth_ = GET_PARAM(3);
+ mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = GET_PARAM(4);
+ }
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+ fwd_txfm_(in, out, stride, tx_type_);
+ }
+
+ void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+ inv_txfm_(out, dst, stride, tx_type_);
+ }
+
+ FhtFunc fwd_txfm_;
+ IhtFunc inv_txfm_;
+};
+
+TEST_P(AV1Trans32x32HT, CoeffCheck) { RunCoeffCheck(); }
+// TODO(luoyi): When CONFIG_AOM_HIGHBITDEPTH = 1, our AVX2 implementation of
+// av1_fht32x32 does not support tran_low_t (int32_t) as the intermediate
+// result, so the MemCheck test cannot pass yet for tx_type = 1, 2, ..., 8.
+#if !CONFIG_AOM_HIGHBITDEPTH
+TEST_P(AV1Trans32x32HT, MemCheck) { RunMemCheck(); }
+#endif
+
+#if CONFIG_AOM_HIGHBITDEPTH
+class AV1HighbdTrans32x32HT
+ : public ::testing::TestWithParam<HighbdHt32x32Param> {
+ public:
+ virtual ~AV1HighbdTrans32x32HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ fwd_txfm_ref_ = highbd_fht32x32_ref;
+ tx_type_ = GET_PARAM(1);
+ bit_depth_ = GET_PARAM(2);
+ mask_ = (1 << bit_depth_) - 1;
+ num_coeffs_ = 1024;
+
+ input_ = reinterpret_cast<int16_t *>(
+ aom_memalign(32, sizeof(int16_t) * num_coeffs_));
+ output_ = reinterpret_cast<int32_t *>(
+ aom_memalign(32, sizeof(int32_t) * num_coeffs_));
+ output_ref_ = reinterpret_cast<int32_t *>(
+ aom_memalign(32, sizeof(int32_t) * num_coeffs_));
+ }
+
+ virtual void TearDown() {
+ aom_free(input_);
+ aom_free(output_);
+ aom_free(output_ref_);
+ libaom_test::ClearSystemState();
+ }
+
+ protected:
+ void RunBitexactCheck();
+
+ private:
+ HbdHtFunc fwd_txfm_;
+ HbdHtFunc fwd_txfm_ref_;
+ int tx_type_;
+ int bit_depth_;
+ int mask_;
+ int num_coeffs_;
+ int16_t *input_;
+ int32_t *output_;
+ int32_t *output_ref_;
+};
+
+void AV1HighbdTrans32x32HT::RunBitexactCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ int i, j;
+ const int stride = 32;
+ const int num_tests = 1000;
+
+ for (i = 0; i < num_tests; ++i) {
+ for (j = 0; j < num_coeffs_; ++j) {
+ input_[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+ }
+
+ fwd_txfm_ref_(input_, output_ref_, stride, tx_type_, bit_depth_);
+ ASM_REGISTER_STATE_CHECK(
+ fwd_txfm_(input_, output_, stride, tx_type_, bit_depth_));
+
+ for (j = 0; j < num_coeffs_; ++j) {
+ EXPECT_EQ(output_ref_[j], output_[j])
+ << "Not bit-exact result at index: " << j << " at test block: " << i;
+ }
+ }
+}
+
+TEST_P(AV1HighbdTrans32x32HT, HighbdCoeffCheck) { RunBitexactCheck(); }
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+using std::tr1::make_tuple;
+
+#if HAVE_AVX2
+const Ht32x32Param kArrayHt32x32Param_avx2[] = {
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 0, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 1, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 2, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 3, AOM_BITS_8, 1024),
+#if CONFIG_EXT_TX
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 4, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 5, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 6, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 7, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 8, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 10, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 11, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 12, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 13, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 14, AOM_BITS_8, 1024),
+ make_tuple(&av1_fht32x32_avx2, dummy_inv_txfm, 15, AOM_BITS_8, 1024)
+#endif // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(AVX2, AV1Trans32x32HT,
+ ::testing::ValuesIn(kArrayHt32x32Param_avx2));
+#endif // HAVE_AVX2
+} // namespace
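RunBitexactCheck() in the new fht32x32_test.cc follows the usual reference-vs-optimized pattern: push the same pseudo-random residual block through av1_fwd_txfm2d_32x32_c and through the candidate function, then compare coefficient for coefficient. A stripped-down sketch of that pattern (Txfm, BitExact and DummyTxfm below are placeholders, not libaom symbols):

#include <cstdint>
#include <vector>

typedef void (*Txfm)(const int16_t *in, int32_t *out, int stride, int tx_type,
                     int bd);

// Returns true if the candidate matches the reference coefficient-for-
// coefficient on a deterministic pseudo-random residual block.
bool BitExact(Txfm ref_txfm, Txfm opt_txfm, int tx_type, int bd) {
  const int n = 32 * 32;
  const int stride = 32;
  const int mask = (1 << bd) - 1;
  std::vector<int16_t> input(n);
  std::vector<int32_t> ref_out(n), opt_out(n);
  for (int j = 0; j < n; ++j)
    input[j] = static_cast<int16_t>(((j * 31) & mask) - ((j * 17) & mask));
  ref_txfm(input.data(), ref_out.data(), stride, tx_type, bd);
  opt_txfm(input.data(), opt_out.data(), stride, tx_type, bd);
  return ref_out == opt_out;  // every coefficient must match exactly
}

static void DummyTxfm(const int16_t *in, int32_t *out, int stride, int tx_type,
                      int bd) {
  (void)stride; (void)tx_type; (void)bd;
  for (int j = 0; j < 32 * 32; ++j) out[j] = in[j];
}

int main() { return BitExact(&DummyTxfm, &DummyTxfm, 0, 10) ? 0 : 1; }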
diff --git a/test/filterintra_predictors_test.cc b/test/filterintra_predictors_test.cc
new file mode 100644
index 0000000..66605f6
--- /dev/null
+++ b/test/filterintra_predictors_test.cc
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./av1_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/enums.h"
+
+namespace {
+
+using std::tr1::tuple;
+using libaom_test::ACMRandom;
+
+typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left);
+
+// Note:
+// Test parameter list:
+// Reference predictor, optimized predictor, prediction mode, block size
+//
+typedef tuple<Predictor, Predictor, int> PredFuncMode;
+typedef tuple<PredFuncMode, int> PredParams;
+
+#if CONFIG_AOM_HIGHBITDEPTH
+typedef void (*HbdPredictor)(uint16_t *dst, ptrdiff_t stride, int bs,
+ const uint16_t *above, const uint16_t *left,
+ int bd);
+
+// Note:
+// Test parameter list:
+// Reference predictor, optimized predictor, prediction mode, block size,
+// bit depth
+//
+typedef tuple<HbdPredictor, HbdPredictor, int> HbdPredFuncMode;
+typedef tuple<HbdPredFuncMode, int, int> HbdPredParams;
+#endif
+
+const int MaxBlkSize = 32;
+
+// The speed test is disabled by default.
+#define PREDICTORS_SPEED_TEST (0)
+
+#if PREDICTORS_SPEED_TEST
+const int MaxTestNum = 100000;
+#else
+const int MaxTestNum = 100;
+#endif
+
+class AV1FilterIntraPredOptimzTest
+ : public ::testing::TestWithParam<PredParams> {
+ public:
+ virtual ~AV1FilterIntraPredOptimzTest() {}
+ virtual void SetUp() {
+ PredFuncMode funcMode = GET_PARAM(0);
+ predFuncRef_ = std::tr1::get<0>(funcMode);
+ predFunc_ = std::tr1::get<1>(funcMode);
+ mode_ = std::tr1::get<2>(funcMode);
+ blockSize_ = GET_PARAM(1);
+
+ alloc_ = new uint8_t[3 * MaxBlkSize + 2];
+ predRef_ = new uint8_t[MaxBlkSize * MaxBlkSize];
+ pred_ = new uint8_t[MaxBlkSize * MaxBlkSize];
+ }
+
+ virtual void TearDown() {
+ delete[] alloc_;
+ delete[] predRef_;
+ delete[] pred_;
+ libaom_test::ClearSystemState();
+ }
+
+ protected:
+ void RunTest() const {
+ int tstIndex = 0;
+ int stride = blockSize_;
+ uint8_t *left = alloc_;
+ uint8_t *above = alloc_ + MaxBlkSize + 1;
+ while (tstIndex < MaxTestNum) {
+ PrepareBuffer();
+ predFuncRef_(predRef_, stride, blockSize_, &above[1], left);
+ ASM_REGISTER_STATE_CHECK(
+ predFunc_(pred_, stride, blockSize_, &above[1], left));
+ DiffPred(tstIndex);
+ tstIndex += 1;
+ }
+ }
+
+ void RunSpeedTestC() const {
+ int tstIndex = 0;
+ int stride = blockSize_;
+ uint8_t *left = alloc_;
+ uint8_t *above = alloc_ + MaxBlkSize + 1;
+ PrepareBuffer();
+ while (tstIndex < MaxTestNum) {
+ predFuncRef_(predRef_, stride, blockSize_, &above[1], left);
+ tstIndex += 1;
+ }
+ }
+
+ void RunSpeedTestSSE() const {
+ int tstIndex = 0;
+ int stride = blockSize_;
+ uint8_t *left = alloc_;
+ uint8_t *above = alloc_ + MaxBlkSize + 1;
+ PrepareBuffer();
+ while (tstIndex < MaxTestNum) {
+ predFunc_(predRef_, stride, blockSize_, &above[1], left);
+ tstIndex += 1;
+ }
+ }
+
+ private:
+ void PrepareBuffer() const {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ int i = 0;
+ while (i < (3 * MaxBlkSize + 2)) {
+ alloc_[i] = rnd.Rand8();
+ i += 1;
+ }
+ }
+
+ void DiffPred(int testNum) const {
+ int i = 0;
+ while (i < blockSize_ * blockSize_) {
+ EXPECT_EQ(predRef_[i], pred_[i]) << "Error at position: " << i << " "
+ << "Block size: " << blockSize_ << " "
+ << "Test number: " << testNum;
+ i += 1;
+ }
+ }
+
+ Predictor predFunc_;
+ Predictor predFuncRef_;
+ int mode_;
+ int blockSize_;
+ uint8_t *alloc_;
+ uint8_t *pred_;
+ uint8_t *predRef_;
+};
+
+#if CONFIG_AOM_HIGHBITDEPTH
+class AV1HbdFilterIntraPredOptimzTest
+ : public ::testing::TestWithParam<HbdPredParams> {
+ public:
+ virtual ~AV1HbdFilterIntraPredOptimzTest() {}
+ virtual void SetUp() {
+ HbdPredFuncMode funcMode = GET_PARAM(0);
+ predFuncRef_ = std::tr1::get<0>(funcMode);
+ predFunc_ = std::tr1::get<1>(funcMode);
+ mode_ = std::tr1::get<2>(funcMode);
+ blockSize_ = GET_PARAM(1);
+ bd_ = GET_PARAM(2);
+
+ alloc_ = new uint16_t[3 * MaxBlkSize + 2];
+ predRef_ = new uint16_t[MaxBlkSize * MaxBlkSize];
+ pred_ = new uint16_t[MaxBlkSize * MaxBlkSize];
+ }
+
+ virtual void TearDown() {
+ delete[] alloc_;
+ delete[] predRef_;
+ delete[] pred_;
+ libaom_test::ClearSystemState();
+ }
+
+ protected:
+ void RunTest() const {
+ int tstIndex = 0;
+ int stride = blockSize_;
+ uint16_t *left = alloc_;
+ uint16_t *above = alloc_ + MaxBlkSize + 1;
+ while (tstIndex < MaxTestNum) {
+ PrepareBuffer();
+ predFuncRef_(predRef_, stride, blockSize_, &above[1], left, bd_);
+ ASM_REGISTER_STATE_CHECK(
+ predFunc_(pred_, stride, blockSize_, &above[1], left, bd_));
+ DiffPred(tstIndex);
+ tstIndex += 1;
+ }
+ }
+
+ void RunSpeedTestC() const {
+ int tstIndex = 0;
+ int stride = blockSize_;
+ uint16_t *left = alloc_;
+ uint16_t *above = alloc_ + MaxBlkSize + 1;
+ PrepareBuffer();
+ while (tstIndex < MaxTestNum) {
+ predFuncRef_(predRef_, stride, blockSize_, &above[1], left, bd_);
+ tstIndex += 1;
+ }
+ }
+
+ void RunSpeedTestSSE() const {
+ int tstIndex = 0;
+ int stride = blockSize_;
+ uint16_t *left = alloc_;
+ uint16_t *above = alloc_ + MaxBlkSize + 1;
+ PrepareBuffer();
+ while (tstIndex < MaxTestNum) {
+ predFunc_(predRef_, stride, blockSize_, &above[1], left, bd_);
+ tstIndex += 1;
+ }
+ }
+
+ private:
+ void PrepareBuffer() const {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ int i = 0;
+ while (i < (3 * MaxBlkSize + 2)) {
+ alloc_[i] = rnd.Rand16() & ((1 << bd_) - 1);
+ i += 1;
+ }
+ }
+
+ void DiffPred(int testNum) const {
+ int i = 0;
+ while (i < blockSize_ * blockSize_) {
+ EXPECT_EQ(predRef_[i], pred_[i]) << "Error at position: " << i << " "
+ << "Block size: " << blockSize_ << " "
+ << "Bit depth: " << bd_ << " "
+ << "Test number: " << testNum;
+ i += 1;
+ }
+ }
+
+ HbdPredictor predFunc_;
+ HbdPredictor predFuncRef_;
+ int mode_;
+ int blockSize_;
+ int bd_;
+ uint16_t *alloc_;
+ uint16_t *pred_;
+ uint16_t *predRef_;
+};
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+TEST_P(AV1FilterIntraPredOptimzTest, BitExactCheck) { RunTest(); }
+
+#if PREDICTORS_SPEED_TEST
+TEST_P(AV1FilterIntraPredOptimzTest, SpeedCheckC) { RunSpeedTestC(); }
+
+TEST_P(AV1FilterIntraPredOptimzTest, SpeedCheckSSE) { RunSpeedTestSSE(); }
+#endif
+
+#if CONFIG_AOM_HIGHBITDEPTH
+TEST_P(AV1HbdFilterIntraPredOptimzTest, BitExactCheck) { RunTest(); }
+
+#if PREDICTORS_SPEED_TEST
+TEST_P(AV1HbdFilterIntraPredOptimzTest, SpeedCheckC) { RunSpeedTestC(); }
+
+TEST_P(AV1HbdFilterIntraPredOptimzTest, SpeedCheckSSE) { RunSpeedTestSSE(); }
+#endif // PREDICTORS_SPEED_TEST
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+using std::tr1::make_tuple;
+
+const PredFuncMode kPredFuncMdArray[] = {
+ make_tuple(av1_dc_filter_predictor_c, av1_dc_filter_predictor_sse4_1,
+ DC_PRED),
+ make_tuple(av1_v_filter_predictor_c, av1_v_filter_predictor_sse4_1, V_PRED),
+ make_tuple(av1_h_filter_predictor_c, av1_h_filter_predictor_sse4_1, H_PRED),
+ make_tuple(av1_d45_filter_predictor_c, av1_d45_filter_predictor_sse4_1,
+ D45_PRED),
+ make_tuple(av1_d135_filter_predictor_c, av1_d135_filter_predictor_sse4_1,
+ D135_PRED),
+ make_tuple(av1_d117_filter_predictor_c, av1_d117_filter_predictor_sse4_1,
+ D117_PRED),
+ make_tuple(av1_d153_filter_predictor_c, av1_d153_filter_predictor_sse4_1,
+ D153_PRED),
+ make_tuple(av1_d207_filter_predictor_c, av1_d207_filter_predictor_sse4_1,
+ D207_PRED),
+ make_tuple(av1_d63_filter_predictor_c, av1_d63_filter_predictor_sse4_1,
+ D63_PRED),
+ make_tuple(av1_tm_filter_predictor_c, av1_tm_filter_predictor_sse4_1,
+ TM_PRED),
+};
+
+const int kBlkSize[] = { 4, 8, 16, 32 };
+
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, AV1FilterIntraPredOptimzTest,
+ ::testing::Combine(::testing::ValuesIn(kPredFuncMdArray),
+ ::testing::ValuesIn(kBlkSize)));
+
+#if CONFIG_AOM_HIGHBITDEPTH
+const HbdPredFuncMode kHbdPredFuncMdArray[] = {
+ make_tuple(av1_highbd_dc_filter_predictor_c,
+ av1_highbd_dc_filter_predictor_sse4_1, DC_PRED),
+ make_tuple(av1_highbd_v_filter_predictor_c,
+ av1_highbd_v_filter_predictor_sse4_1, V_PRED),
+ make_tuple(av1_highbd_h_filter_predictor_c,
+ av1_highbd_h_filter_predictor_sse4_1, H_PRED),
+ make_tuple(av1_highbd_d45_filter_predictor_c,
+ av1_highbd_d45_filter_predictor_sse4_1, D45_PRED),
+ make_tuple(av1_highbd_d135_filter_predictor_c,
+ av1_highbd_d135_filter_predictor_sse4_1, D135_PRED),
+ make_tuple(av1_highbd_d117_filter_predictor_c,
+ av1_highbd_d117_filter_predictor_sse4_1, D117_PRED),
+ make_tuple(av1_highbd_d153_filter_predictor_c,
+ av1_highbd_d153_filter_predictor_sse4_1, D153_PRED),
+ make_tuple(av1_highbd_d207_filter_predictor_c,
+ av1_highbd_d207_filter_predictor_sse4_1, D207_PRED),
+ make_tuple(av1_highbd_d63_filter_predictor_c,
+ av1_highbd_d63_filter_predictor_sse4_1, D63_PRED),
+ make_tuple(av1_highbd_tm_filter_predictor_c,
+ av1_highbd_tm_filter_predictor_sse4_1, TM_PRED),
+};
+
+const int kBd[] = { 10, 12 };
+
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, AV1HbdFilterIntraPredOptimzTest,
+ ::testing::Combine(::testing::ValuesIn(kHbdPredFuncMdArray),
+ ::testing::ValuesIn(kBlkSize),
+ ::testing::ValuesIn(kBd)));
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+} // namespace
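AV1FilterIntraPredOptimzTest keeps the left column, the top-left sample and the above row in a single allocation of 3 * MaxBlkSize + 2 bytes and hands the predictors &above[1], so that index -1 still addresses the top-left neighbour. A small standalone sketch of that layout (not the test code itself):

#include <cstdint>

int main() {
  const int MaxBlkSize = 32;
  static uint8_t alloc[3 * MaxBlkSize + 2];  // same size as the test's alloc_
  uint8_t *left = alloc;                     // left-column samples
  uint8_t *above = alloc + MaxBlkSize + 1;   // above[0] holds the top-left pixel
  uint8_t *above_row = &above[1];            // pointer the predictors receive
  above_row[-1] = 128;                       // top-left neighbour, still in bounds
  above_row[2 * MaxBlkSize - 1] = 128;       // last above sample, still in bounds
  return (left[0] == 0 && above_row[-1] == 128) ? 0 : 1;
}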
diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc
new file mode 100644
index 0000000..bc8a406
--- /dev/null
+++ b/test/hadamard_test.cc
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <algorithm>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./aom_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+
+namespace {
+
+using ::libaom_test::ACMRandom;
+
+typedef void (*HadamardFunc)(const int16_t *a, int a_stride, int16_t *b);
+
+void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) {
+ int16_t b[8];
+ for (int i = 0; i < 8; i += 2) {
+ b[i + 0] = a[i * a_stride] + a[(i + 1) * a_stride];
+ b[i + 1] = a[i * a_stride] - a[(i + 1) * a_stride];
+ }
+ int16_t c[8];
+ for (int i = 0; i < 8; i += 4) {
+ c[i + 0] = b[i + 0] + b[i + 2];
+ c[i + 1] = b[i + 1] + b[i + 3];
+ c[i + 2] = b[i + 0] - b[i + 2];
+ c[i + 3] = b[i + 1] - b[i + 3];
+ }
+ out[0] = c[0] + c[4];
+ out[7] = c[1] + c[5];
+ out[3] = c[2] + c[6];
+ out[4] = c[3] + c[7];
+ out[2] = c[0] - c[4];
+ out[6] = c[1] - c[5];
+ out[1] = c[2] - c[6];
+ out[5] = c[3] - c[7];
+}
+
+void reference_hadamard8x8(const int16_t *a, int a_stride, int16_t *b) {
+ int16_t buf[64];
+ for (int i = 0; i < 8; ++i) {
+ hadamard_loop(a + i, a_stride, buf + i * 8);
+ }
+
+ for (int i = 0; i < 8; ++i) {
+ hadamard_loop(buf + i, 8, b + i * 8);
+ }
+}
+
+void reference_hadamard16x16(const int16_t *a, int a_stride, int16_t *b) {
+ /* The source is a 16x16 block. The destination is rearranged to 8x32.
+ * Input is 9 bit. */
+ reference_hadamard8x8(a + 0 + 0 * a_stride, a_stride, b + 0);
+ reference_hadamard8x8(a + 8 + 0 * a_stride, a_stride, b + 64);
+ reference_hadamard8x8(a + 0 + 8 * a_stride, a_stride, b + 128);
+ reference_hadamard8x8(a + 8 + 8 * a_stride, a_stride, b + 192);
+
+ /* Overlay the 8x8 blocks and combine. */
+ for (int i = 0; i < 64; ++i) {
+ /* 8x8 steps the range up to 15 bits. */
+ const int16_t a0 = b[0];
+ const int16_t a1 = b[64];
+ const int16_t a2 = b[128];
+ const int16_t a3 = b[192];
+
+ /* Prevent the result from escaping int16_t. */
+ const int16_t b0 = (a0 + a1) >> 1;
+ const int16_t b1 = (a0 - a1) >> 1;
+ const int16_t b2 = (a2 + a3) >> 1;
+ const int16_t b3 = (a2 - a3) >> 1;
+
+ /* Store a 16 bit value. */
+ b[0] = b0 + b2;
+ b[64] = b1 + b3;
+ b[128] = b0 - b2;
+ b[192] = b1 - b3;
+
+ ++b;
+ }
+}
+
+class HadamardTestBase : public ::testing::TestWithParam<HadamardFunc> {
+ public:
+ virtual void SetUp() {
+ h_func_ = GetParam();
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ }
+
+ protected:
+ HadamardFunc h_func_;
+ ACMRandom rnd_;
+};
+
+class Hadamard8x8Test : public HadamardTestBase {};
+
+TEST_P(Hadamard8x8Test, CompareReferenceRandom) {
+ DECLARE_ALIGNED(16, int16_t, a[64]);
+ DECLARE_ALIGNED(16, int16_t, b[64]);
+ int16_t b_ref[64];
+ for (int i = 0; i < 64; ++i) {
+ a[i] = rnd_.Rand9Signed();
+ }
+ memset(b, 0, sizeof(b));
+ memset(b_ref, 0, sizeof(b_ref));
+
+ reference_hadamard8x8(a, 8, b_ref);
+ ASM_REGISTER_STATE_CHECK(h_func_(a, 8, b));
+
+ // The order of the output is not important. Sort before checking.
+ std::sort(b, b + 64);
+ std::sort(b_ref, b_ref + 64);
+ EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+}
+
+TEST_P(Hadamard8x8Test, VaryStride) {
+ DECLARE_ALIGNED(16, int16_t, a[64 * 8]);
+ DECLARE_ALIGNED(16, int16_t, b[64]);
+ int16_t b_ref[64];
+ for (int i = 0; i < 64 * 8; ++i) {
+ a[i] = rnd_.Rand9Signed();
+ }
+
+ for (int i = 8; i < 64; i += 8) {
+ memset(b, 0, sizeof(b));
+ memset(b_ref, 0, sizeof(b_ref));
+
+ reference_hadamard8x8(a, i, b_ref);
+ ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
+
+ // The order of the output is not important. Sort before checking.
+ std::sort(b, b + 64);
+ std::sort(b_ref, b_ref + 64);
+ EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test,
+ ::testing::Values(&aom_hadamard_8x8_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, Hadamard8x8Test,
+ ::testing::Values(&aom_hadamard_8x8_sse2));
+#endif // HAVE_SSE2
+
+#if HAVE_SSSE3 && ARCH_X86_64
+INSTANTIATE_TEST_CASE_P(SSSE3, Hadamard8x8Test,
+ ::testing::Values(&aom_hadamard_8x8_ssse3));
+#endif // HAVE_SSSE3 && ARCH_X86_64
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test,
+ ::testing::Values(&aom_hadamard_8x8_neon));
+#endif // HAVE_NEON
+
+class Hadamard16x16Test : public HadamardTestBase {};
+
+TEST_P(Hadamard16x16Test, CompareReferenceRandom) {
+ DECLARE_ALIGNED(16, int16_t, a[16 * 16]);
+ DECLARE_ALIGNED(16, int16_t, b[16 * 16]);
+ int16_t b_ref[16 * 16];
+ for (int i = 0; i < 16 * 16; ++i) {
+ a[i] = rnd_.Rand9Signed();
+ }
+ memset(b, 0, sizeof(b));
+ memset(b_ref, 0, sizeof(b_ref));
+
+ reference_hadamard16x16(a, 16, b_ref);
+ ASM_REGISTER_STATE_CHECK(h_func_(a, 16, b));
+
+ // The order of the output is not important. Sort before checking.
+ std::sort(b, b + 16 * 16);
+ std::sort(b_ref, b_ref + 16 * 16);
+ EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+}
+
+TEST_P(Hadamard16x16Test, VaryStride) {
+ DECLARE_ALIGNED(16, int16_t, a[16 * 16 * 8]);
+ DECLARE_ALIGNED(16, int16_t, b[16 * 16]);
+ int16_t b_ref[16 * 16];
+ for (int i = 0; i < 16 * 16 * 8; ++i) {
+ a[i] = rnd_.Rand9Signed();
+ }
+
+ for (int i = 8; i < 64; i += 8) {
+ memset(b, 0, sizeof(b));
+ memset(b_ref, 0, sizeof(b_ref));
+
+ reference_hadamard16x16(a, i, b_ref);
+ ASM_REGISTER_STATE_CHECK(h_func_(a, i, b));
+
+ // The order of the output is not important. Sort before checking.
+ std::sort(b, b + 16 * 16);
+ std::sort(b_ref, b_ref + 16 * 16);
+ EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b)));
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test,
+ ::testing::Values(&aom_hadamard_16x16_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test,
+ ::testing::Values(&aom_hadamard_16x16_sse2));
+#endif // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test,
+ ::testing::Values(&aom_hadamard_16x16_neon));
+#endif // HAVE_NEON
+} // namespace
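The bit-width comments in reference_hadamard16x16() can be checked with a little arithmetic: a full 8x8 Hadamard sums 64 samples, so 9-bit signed residuals grow by log2(64) = 6 bits to 15 bits; combining the four 8x8 outputs adds two more addition stages, which would need 17 bits without the >> 1 and overflow int16_t, whereas with the shift the stored value stays within 16 bits. A sketch of that accounting:

#include <cstdio>

int main() {
  const int input_bits = 9;                  // residuals are 9-bit signed
  const int after_8x8 = input_bits + 6;      // 64-sample sum -> +6 bits = 15
  const int naive_16x16 = after_8x8 + 2;     // two more add stages -> 17 bits
  const int with_shift = after_8x8 + 2 - 1;  // >> 1 before the last stage -> 16
  std::printf("8x8: %d bits, naive 16x16: %d bits, stored: %d bits\n",
              after_8x8, naive_16x16, with_shift);
  return 0;
}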
diff --git a/test/hbd_metrics_test.cc b/test/hbd_metrics_test.cc
index c6f6bbd..455e180 100644
--- a/test/hbd_metrics_test.cc
+++ b/test/hbd_metrics_test.cc
@@ -13,15 +13,15 @@
#include <stdlib.h>
#include <new>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
#include "./aom_config.h"
#include "aom_dsp/psnr.h"
#include "aom_dsp/ssim.h"
#include "aom_ports/mem.h"
#include "aom_ports/msvc.h"
#include "aom_scale/yv12config.h"
-#include "test/acm_random.h"
-#include "test/util.h"
-#include "third_party/googletest/src/include/gtest/gtest.h"
using libaom_test::ACMRandom;
@@ -97,7 +97,7 @@
void RunAccuracyCheck() {
const int width = 1920;
const int height = 1080;
- int i = 0;
+ size_t i = 0;
const uint8_t kPixFiller = 128;
YV12_BUFFER_CONFIG lbd_src, lbd_dst;
YV12_BUFFER_CONFIG hbd_src, hbd_dst;
@@ -157,13 +157,13 @@
lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
+
aom_free_frame_buffer(&lbd_src);
aom_free_frame_buffer(&lbd_dst);
aom_free_frame_buffer(&hbd_src);
aom_free_frame_buffer(&hbd_dst);
-
- EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
}
+
int input_bit_depth_;
int bit_depth_;
double threshold_;
diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc
index bffdf2d..ff43464 100644
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -71,7 +71,8 @@
for (int j = 0; j < 64; ++j) input[j] = src[j] - dst[j];
reference_dct_2d(input, output_r);
- for (int j = 0; j < 64; ++j) coeff[j] = round(output_r[j]);
+ for (int j = 0; j < 64; ++j)
+ coeff[j] = static_cast<tran_low_t>(round(output_r[j]));
aom_idct8x8_64_add_c(coeff, dst, 8);
for (int j = 0; j < 64; ++j) {
const int diff = dst[j] - src[j];
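The only functional change in idct8x8_test.cc is the explicit cast on the rounded coefficient: round() returns double, and storing it in the integer coefficient type otherwise relies on an implicit floating-point-to-integer conversion that compilers warn about. A minimal sketch (tran_low_t is assumed to be int32_t here, as in high-bit-depth builds):

#include <cmath>
#include <cstdint>

typedef int32_t tran_low_t;  // assumption: the high-bit-depth coefficient type

int main() {
  const double output_r = 12.6;  // stand-in for a reference_dct_2d() output
  const tran_low_t coeff = static_cast<tran_low_t>(std::round(output_r));
  return coeff == 13 ? 0 : 1;
}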
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index 78f41cc..12eabcf 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -29,45 +29,42 @@
const int count_test_block = 100000;
-// Base class for AV1 intra prediction tests.
-class AV1IntraPredBase {
+typedef void (*IntraPred)(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left, int bps);
+
+struct IntraPredFunc {
+ IntraPredFunc(IntraPred pred = NULL, IntraPred ref = NULL,
+ int block_size_value = 0, int bit_depth_value = 0)
+ : pred_fn(pred), ref_fn(ref), block_size(block_size_value),
+ bit_depth(bit_depth_value) {}
+
+ IntraPred pred_fn;
+ IntraPred ref_fn;
+ int block_size;
+ int bit_depth;
+};
+
+class AV1IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> {
public:
- virtual ~AV1IntraPredBase() { libaom_test::ClearSystemState(); }
-
- protected:
- virtual void Predict() = 0;
-
- void CheckPrediction(int test_case_number, int *error_count) const {
- // For each pixel ensure that the calculated value is the same as reference.
- for (int y = 0; y < block_size_; y++) {
- for (int x = 0; x < block_size_; x++) {
- *error_count += ref_dst_[x + y * stride_] != dst_[x + y * stride_];
- if (*error_count == 1) {
- ASSERT_EQ(ref_dst_[x + y * stride_], dst_[x + y * stride_])
- << " Failed on Test Case Number " << test_case_number;
- }
- }
- }
- }
-
void RunTest(uint16_t *left_col, uint16_t *above_data, uint16_t *dst,
uint16_t *ref_dst) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int block_size = params_.block_size;
+ above_row_ = above_data + 16;
left_col_ = left_col;
dst_ = dst;
ref_dst_ = ref_dst;
- above_row_ = above_data + 16;
int error_count = 0;
for (int i = 0; i < count_test_block; ++i) {
// Fill edges with random data, try first with saturated values.
- for (int x = -1; x <= block_size_ * 2; x++) {
+ for (int x = -1; x <= block_size * 2; x++) {
if (i == 0) {
above_row_[x] = mask_;
} else {
above_row_[x] = rnd.Rand16() & mask_;
}
}
- for (int y = 0; y < block_size_; y++) {
+ for (int y = 0; y < block_size; y++) {
if (i == 0) {
left_col_[y] = mask_;
} else {
@@ -80,41 +77,42 @@
ASSERT_EQ(0, error_count);
}
- int block_size_;
+ protected:
+ virtual void SetUp() {
+ params_ = GetParam();
+ stride_ = params_.block_size * 3;
+ mask_ = (1 << params_.bit_depth) - 1;
+ }
+
+ void Predict() {
+ const int bit_depth = params_.bit_depth;
+ params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
+ ASM_REGISTER_STATE_CHECK(
+ params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
+ }
+
+ void CheckPrediction(int test_case_number, int *error_count) const {
+ // For each pixel ensure that the calculated value is the same as reference.
+ const int block_size = params_.block_size;
+ for (int y = 0; y < block_size; y++) {
+ for (int x = 0; x < block_size; x++) {
+ *error_count += ref_dst_[x + y * stride_] != dst_[x + y * stride_];
+ if (*error_count == 1) {
+ ASSERT_EQ(ref_dst_[x + y * stride_], dst_[x + y * stride_])
+ << " Failed on Test Case Number " << test_case_number;
+ }
+ }
+ }
+ }
+
uint16_t *above_row_;
uint16_t *left_col_;
uint16_t *dst_;
uint16_t *ref_dst_;
ptrdiff_t stride_;
int mask_;
-};
-typedef void (*intra_pred_fn_t)(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above, const uint16_t *left,
- int bps);
-typedef std::tr1::tuple<intra_pred_fn_t, intra_pred_fn_t, int, int>
- intra_pred_params_t;
-class AV1IntraPredTest : public AV1IntraPredBase,
- public ::testing::TestWithParam<intra_pred_params_t> {
- virtual void SetUp() {
- pred_fn_ = GET_PARAM(0);
- ref_fn_ = GET_PARAM(1);
- block_size_ = GET_PARAM(2);
- bit_depth_ = GET_PARAM(3);
- stride_ = block_size_ * 3;
- mask_ = (1 << bit_depth_) - 1;
- }
-
- virtual void Predict() {
- const uint16_t *const_above_row = above_row_;
- const uint16_t *const_left_col = left_col_;
- ref_fn_(ref_dst_, stride_, const_above_row, const_left_col, bit_depth_);
- ASM_REGISTER_STATE_CHECK(
- pred_fn_(dst_, stride_, const_above_row, const_left_col, bit_depth_));
- }
- intra_pred_fn_t pred_fn_;
- intra_pred_fn_t ref_fn_;
- int bit_depth_;
+ IntraPredFunc params_;
};
TEST_P(AV1IntraPredTest, IntraPredTests) {
@@ -126,90 +124,107 @@
RunTest(left_col, above_data, dst, ref_dst);
}
-using std::tr1::make_tuple;
-
#if HAVE_SSE2
#if CONFIG_AOM_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
SSE2_TO_C_8, AV1IntraPredTest,
- ::testing::Values(make_tuple(&aom_highbd_dc_predictor_32x32_sse2,
- &aom_highbd_dc_predictor_32x32_c, 32, 8),
- make_tuple(&aom_highbd_tm_predictor_16x16_sse2,
- &aom_highbd_tm_predictor_16x16_c, 16, 8),
- make_tuple(&aom_highbd_tm_predictor_32x32_sse2,
- &aom_highbd_tm_predictor_32x32_c, 32, 8),
- make_tuple(&aom_highbd_dc_predictor_4x4_sse2,
- &aom_highbd_dc_predictor_4x4_c, 4, 8),
- make_tuple(&aom_highbd_dc_predictor_8x8_sse2,
- &aom_highbd_dc_predictor_8x8_c, 8, 8),
- make_tuple(&aom_highbd_dc_predictor_16x16_sse2,
- &aom_highbd_dc_predictor_16x16_c, 16, 8),
- make_tuple(&aom_highbd_v_predictor_4x4_sse2,
- &aom_highbd_v_predictor_4x4_c, 4, 8),
- make_tuple(&aom_highbd_v_predictor_8x8_sse2,
- &aom_highbd_v_predictor_8x8_c, 8, 8),
- make_tuple(&aom_highbd_v_predictor_16x16_sse2,
- &aom_highbd_v_predictor_16x16_c, 16, 8),
- make_tuple(&aom_highbd_v_predictor_32x32_sse2,
- &aom_highbd_v_predictor_32x32_c, 32, 8),
- make_tuple(&aom_highbd_tm_predictor_4x4_sse2,
- &aom_highbd_tm_predictor_4x4_c, 4, 8),
- make_tuple(&aom_highbd_tm_predictor_8x8_sse2,
- &aom_highbd_tm_predictor_8x8_c, 8, 8)));
+ ::testing::Values(IntraPredFunc(&aom_highbd_dc_predictor_32x32_sse2,
+ &aom_highbd_dc_predictor_32x32_c, 32, 8),
+#if !CONFIG_ALT_INTRA
+ IntraPredFunc(&aom_highbd_tm_predictor_16x16_sse2,
+ &aom_highbd_tm_predictor_16x16_c, 16, 8),
+ IntraPredFunc(&aom_highbd_tm_predictor_32x32_sse2,
+ &aom_highbd_tm_predictor_32x32_c, 32, 8),
+#endif // !CONFIG_ALT_INTRA
+
+ IntraPredFunc(&aom_highbd_dc_predictor_4x4_sse2,
+ &aom_highbd_dc_predictor_4x4_c, 4, 8),
+ IntraPredFunc(&aom_highbd_dc_predictor_8x8_sse2,
+ &aom_highbd_dc_predictor_8x8_c, 8, 8),
+ IntraPredFunc(&aom_highbd_dc_predictor_16x16_sse2,
+ &aom_highbd_dc_predictor_16x16_c, 16, 8),
+ IntraPredFunc(&aom_highbd_v_predictor_4x4_sse2,
+ &aom_highbd_v_predictor_4x4_c, 4, 8),
+ IntraPredFunc(&aom_highbd_v_predictor_8x8_sse2,
+ &aom_highbd_v_predictor_8x8_c, 8, 8),
+ IntraPredFunc(&aom_highbd_v_predictor_16x16_sse2,
+ &aom_highbd_v_predictor_16x16_c, 16, 8),
+ IntraPredFunc(&aom_highbd_v_predictor_32x32_sse2,
+ &aom_highbd_v_predictor_32x32_c, 32, 8)
+#if !CONFIG_ALT_INTRA
+ ,
+ IntraPredFunc(&aom_highbd_tm_predictor_4x4_sse2,
+ &aom_highbd_tm_predictor_4x4_c, 4, 8),
+ IntraPredFunc(&aom_highbd_tm_predictor_8x8_sse2,
+ &aom_highbd_tm_predictor_8x8_c, 8, 8)
+#endif // !CONFIG_ALT_INTRA
+ ));
INSTANTIATE_TEST_CASE_P(
SSE2_TO_C_10, AV1IntraPredTest,
- ::testing::Values(make_tuple(&aom_highbd_dc_predictor_32x32_sse2,
- &aom_highbd_dc_predictor_32x32_c, 32, 10),
- make_tuple(&aom_highbd_tm_predictor_16x16_sse2,
- &aom_highbd_tm_predictor_16x16_c, 16, 10),
- make_tuple(&aom_highbd_tm_predictor_32x32_sse2,
- &aom_highbd_tm_predictor_32x32_c, 32, 10),
- make_tuple(&aom_highbd_dc_predictor_4x4_sse2,
- &aom_highbd_dc_predictor_4x4_c, 4, 10),
- make_tuple(&aom_highbd_dc_predictor_8x8_sse2,
- &aom_highbd_dc_predictor_8x8_c, 8, 10),
- make_tuple(&aom_highbd_dc_predictor_16x16_sse2,
- &aom_highbd_dc_predictor_16x16_c, 16, 10),
- make_tuple(&aom_highbd_v_predictor_4x4_sse2,
- &aom_highbd_v_predictor_4x4_c, 4, 10),
- make_tuple(&aom_highbd_v_predictor_8x8_sse2,
- &aom_highbd_v_predictor_8x8_c, 8, 10),
- make_tuple(&aom_highbd_v_predictor_16x16_sse2,
- &aom_highbd_v_predictor_16x16_c, 16, 10),
- make_tuple(&aom_highbd_v_predictor_32x32_sse2,
- &aom_highbd_v_predictor_32x32_c, 32, 10),
- make_tuple(&aom_highbd_tm_predictor_4x4_sse2,
- &aom_highbd_tm_predictor_4x4_c, 4, 10),
- make_tuple(&aom_highbd_tm_predictor_8x8_sse2,
- &aom_highbd_tm_predictor_8x8_c, 8, 10)));
+ ::testing::Values(IntraPredFunc(&aom_highbd_dc_predictor_32x32_sse2,
+ &aom_highbd_dc_predictor_32x32_c, 32, 10),
+#if !CONFIG_ALT_INTRA
+ IntraPredFunc(&aom_highbd_tm_predictor_16x16_sse2,
+ &aom_highbd_tm_predictor_16x16_c, 16, 10),
+ IntraPredFunc(&aom_highbd_tm_predictor_32x32_sse2,
+ &aom_highbd_tm_predictor_32x32_c, 32, 10),
+#endif // !CONFIG_ALT_INTRA
+ IntraPredFunc(&aom_highbd_dc_predictor_4x4_sse2,
+ &aom_highbd_dc_predictor_4x4_c, 4, 10),
+ IntraPredFunc(&aom_highbd_dc_predictor_8x8_sse2,
+ &aom_highbd_dc_predictor_8x8_c, 8, 10),
+ IntraPredFunc(&aom_highbd_dc_predictor_16x16_sse2,
+ &aom_highbd_dc_predictor_16x16_c, 16, 10),
+ IntraPredFunc(&aom_highbd_v_predictor_4x4_sse2,
+ &aom_highbd_v_predictor_4x4_c, 4, 10),
+ IntraPredFunc(&aom_highbd_v_predictor_8x8_sse2,
+ &aom_highbd_v_predictor_8x8_c, 8, 10),
+ IntraPredFunc(&aom_highbd_v_predictor_16x16_sse2,
+ &aom_highbd_v_predictor_16x16_c, 16, 10),
+ IntraPredFunc(&aom_highbd_v_predictor_32x32_sse2,
+ &aom_highbd_v_predictor_32x32_c, 32, 10)
+#if !CONFIG_ALT_INTRA
+ ,
+ IntraPredFunc(&aom_highbd_tm_predictor_4x4_sse2,
+ &aom_highbd_tm_predictor_4x4_c, 4, 10),
+ IntraPredFunc(&aom_highbd_tm_predictor_8x8_sse2,
+ &aom_highbd_tm_predictor_8x8_c, 8, 10)
+#endif // !CONFIG_ALT_INTRA
+ ));
INSTANTIATE_TEST_CASE_P(
SSE2_TO_C_12, AV1IntraPredTest,
- ::testing::Values(make_tuple(&aom_highbd_dc_predictor_32x32_sse2,
- &aom_highbd_dc_predictor_32x32_c, 32, 12),
- make_tuple(&aom_highbd_tm_predictor_16x16_sse2,
- &aom_highbd_tm_predictor_16x16_c, 16, 12),
- make_tuple(&aom_highbd_tm_predictor_32x32_sse2,
- &aom_highbd_tm_predictor_32x32_c, 32, 12),
- make_tuple(&aom_highbd_dc_predictor_4x4_sse2,
- &aom_highbd_dc_predictor_4x4_c, 4, 12),
- make_tuple(&aom_highbd_dc_predictor_8x8_sse2,
- &aom_highbd_dc_predictor_8x8_c, 8, 12),
- make_tuple(&aom_highbd_dc_predictor_16x16_sse2,
- &aom_highbd_dc_predictor_16x16_c, 16, 12),
- make_tuple(&aom_highbd_v_predictor_4x4_sse2,
- &aom_highbd_v_predictor_4x4_c, 4, 12),
- make_tuple(&aom_highbd_v_predictor_8x8_sse2,
- &aom_highbd_v_predictor_8x8_c, 8, 12),
- make_tuple(&aom_highbd_v_predictor_16x16_sse2,
- &aom_highbd_v_predictor_16x16_c, 16, 12),
- make_tuple(&aom_highbd_v_predictor_32x32_sse2,
- &aom_highbd_v_predictor_32x32_c, 32, 12),
- make_tuple(&aom_highbd_tm_predictor_4x4_sse2,
- &aom_highbd_tm_predictor_4x4_c, 4, 12),
- make_tuple(&aom_highbd_tm_predictor_8x8_sse2,
- &aom_highbd_tm_predictor_8x8_c, 8, 12)));
+ ::testing::Values(IntraPredFunc(&aom_highbd_dc_predictor_32x32_sse2,
+ &aom_highbd_dc_predictor_32x32_c, 32, 12),
+#if !CONFIG_ALT_INTRA
+ IntraPredFunc(&aom_highbd_tm_predictor_16x16_sse2,
+ &aom_highbd_tm_predictor_16x16_c, 16, 12),
+ IntraPredFunc(&aom_highbd_tm_predictor_32x32_sse2,
+ &aom_highbd_tm_predictor_32x32_c, 32, 12),
+#endif // !CONFIG_ALT_INTRA
+ IntraPredFunc(&aom_highbd_dc_predictor_4x4_sse2,
+ &aom_highbd_dc_predictor_4x4_c, 4, 12),
+ IntraPredFunc(&aom_highbd_dc_predictor_8x8_sse2,
+ &aom_highbd_dc_predictor_8x8_c, 8, 12),
+ IntraPredFunc(&aom_highbd_dc_predictor_16x16_sse2,
+ &aom_highbd_dc_predictor_16x16_c, 16, 12),
+ IntraPredFunc(&aom_highbd_v_predictor_4x4_sse2,
+ &aom_highbd_v_predictor_4x4_c, 4, 12),
+ IntraPredFunc(&aom_highbd_v_predictor_8x8_sse2,
+ &aom_highbd_v_predictor_8x8_c, 8, 12),
+ IntraPredFunc(&aom_highbd_v_predictor_16x16_sse2,
+ &aom_highbd_v_predictor_16x16_c, 16, 12),
+ IntraPredFunc(&aom_highbd_v_predictor_32x32_sse2,
+ &aom_highbd_v_predictor_32x32_c, 32, 12)
+#if !CONFIG_ALT_INTRA
+ ,
+ IntraPredFunc(&aom_highbd_tm_predictor_4x4_sse2,
+ &aom_highbd_tm_predictor_4x4_c, 4, 12),
+ IntraPredFunc(&aom_highbd_tm_predictor_8x8_sse2,
+ &aom_highbd_tm_predictor_8x8_c, 8, 12)
+#endif // !CONFIG_ALT_INTRA
+ ));
#endif // CONFIG_AOM_HIGHBITDEPTH
#endif // HAVE_SSE2
diff --git a/test/level_test.cc b/test/level_test.cc
new file mode 100644
index 0000000..3c46903
--- /dev/null
+++ b/test/level_test.cc
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+class LevelTest
+ : public ::libaom_test::EncoderTest,
+ public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int> {
+ protected:
+ LevelTest()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ cpu_used_(GET_PARAM(2)), min_gf_internal_(24), target_level_(0),
+ level_(0) {}
+ virtual ~LevelTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ cfg_.g_lag_in_frames = 25;
+ cfg_.rc_end_usage = AOM_VBR;
+ } else {
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_end_usage = AOM_CBR;
+ }
+ cfg_.rc_2pass_vbr_minsection_pct = 5;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
+ cfg_.rc_target_bitrate = 400;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_min_quantizer = 0;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ encoder->Control(AV1E_SET_TARGET_LEVEL, target_level_);
+ encoder->Control(AV1E_SET_MIN_GF_INTERVAL, min_gf_internal_);
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ encoder->Control(AOME_SET_ARNR_TYPE, 3);
+ }
+ }
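+    // Read back the level the encoder has computed so far; it must always
+    // stay within the valid range, whatever target level was requested.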
+ encoder->Control(AV1E_GET_LEVEL, &level_);
+ ASSERT_LE(level_, 51);
+ ASSERT_GE(level_, 0);
+ }
+
+ ::libaom_test::TestMode encoding_mode_;
+ int cpu_used_;
+ int min_gf_internal_;
+ int target_level_;
+ int level_;
+};
+
+// Test for keeping level stats only
+TEST_P(LevelTest, TestTargetLevel0) {
+ ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+ 40);
+ target_level_ = 0;
+ min_gf_internal_ = 4;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
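+  // Levels are encoded as major * 10 + minor, so 11 here means level 1.1 and
+  // 20 below means level 2.0.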
+ ASSERT_EQ(11, level_);
+
+ cfg_.rc_target_bitrate = 1600;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_EQ(20, level_);
+}
+
+// Test for level control being turned off
+TEST_P(LevelTest, TestTargetLevel255) {
+ ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
+ 30);
+ target_level_ = 255;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(LevelTest, TestTargetLevelApi) {
+ ::libaom_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, 1);
+ static const aom_codec_iface_t *codec = &aom_codec_av1_cx_algo;
+ aom_codec_ctx_t enc;
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(codec, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, codec, &cfg, 0));
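+  // Only the defined level indices (10 through 62, i.e. levels 1.0-6.2) and
+  // the special values 0 (keep level stats only) and 255 (level control off)
+  // are accepted; every other value must be rejected.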
+ for (int level = 0; level <= 256; ++level) {
+ if (level == 10 || level == 11 || level == 20 || level == 21 ||
+ level == 30 || level == 31 || level == 40 || level == 41 ||
+ level == 50 || level == 51 || level == 52 || level == 60 ||
+ level == 61 || level == 62 || level == 0 || level == 255)
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AV1E_SET_TARGET_LEVEL, level));
+ else
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_control(&enc, AV1E_SET_TARGET_LEVEL, level));
+ }
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+AV1_INSTANTIATE_TEST_CASE(LevelTest,
+ ::testing::Values(::libaom_test::kTwoPassGood,
+ ::libaom_test::kOnePassGood),
+ ::testing::Range(0, 9));
+} // namespace
diff --git a/test/lossless_test.cc b/test/lossless_test.cc
index bc492b4..9fd5bea 100644
--- a/test/lossless_test.cc
+++ b/test/lossless_test.cc
@@ -22,15 +22,15 @@
const int kMaxPsnr = 100;
-class LosslessTest
+class LosslessTestLarge
: public ::libaom_test::EncoderTest,
public ::libaom_test::CodecTestWithParam<libaom_test::TestMode> {
protected:
- LosslessTest()
+ LosslessTestLarge()
: EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0),
encoding_mode_(GET_PARAM(1)) {}
- virtual ~LosslessTest() {}
+ virtual ~LosslessTestLarge() {}
virtual void SetUp() {
InitializeConfig();
@@ -65,7 +65,7 @@
libaom_test::TestMode encoding_mode_;
};
-TEST_P(LosslessTest, TestLossLessEncoding) {
+TEST_P(LosslessTestLarge, TestLossLessEncoding) {
const aom_rational timebase = { 33333333, 1000000000 };
cfg_.g_timebase = timebase;
cfg_.rc_target_bitrate = 2000;
@@ -77,14 +77,14 @@
// intentionally changed the dimension for better testing coverage
libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
- timebase.den, timebase.num, 0, 10);
+ timebase.den, timebase.num, 0, 5);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
const double psnr_lossless = GetMinPsnr();
EXPECT_GE(psnr_lossless, kMaxPsnr);
}
-TEST_P(LosslessTest, TestLossLessEncoding444) {
- libaom_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 10);
+TEST_P(LosslessTestLarge, TestLossLessEncoding444) {
+ libaom_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 5);
cfg_.g_profile = 1;
cfg_.g_timebase = video.timebase();
@@ -100,7 +100,7 @@
EXPECT_GE(psnr_lossless, kMaxPsnr);
}
-TEST_P(LosslessTest, TestLossLessEncodingCtrl) {
+TEST_P(LosslessTestLarge, TestLossLessEncodingCtrl) {
const aom_rational timebase = { 33333333, 1000000000 };
cfg_.g_timebase = timebase;
cfg_.rc_target_bitrate = 2000;
@@ -113,13 +113,13 @@
init_flags_ = AOM_CODEC_USE_PSNR;
libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
- timebase.den, timebase.num, 0, 10);
+ timebase.den, timebase.num, 0, 5);
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
const double psnr_lossless = GetMinPsnr();
EXPECT_GE(psnr_lossless, kMaxPsnr);
}
-AV1_INSTANTIATE_TEST_CASE(LosslessTest,
+AV1_INSTANTIATE_TEST_CASE(LosslessTestLarge,
::testing::Values(::libaom_test::kOnePassGood,
::libaom_test::kTwoPassGood));
} // namespace
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index 5d814f4..318df19 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -160,7 +160,7 @@
loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh));
#endif // CONFIG_AOM_HIGHBITDEPTH
- for (int j = 0; j < kNumCoeffs; ++j) {
+ for (j = 0; j < kNumCoeffs; ++j) {
err_count += ref_s[j] != s[j];
}
if (err_count && !err_count_total) {
@@ -324,7 +324,7 @@
ASM_REGISTER_STATE_CHECK(loopfilter_op_(s + 8 + p * 8, p, blimit0, limit0,
thresh0, blimit1, limit1, thresh1));
#endif // CONFIG_AOM_HIGHBITDEPTH
- for (int j = 0; j < kNumCoeffs; ++j) {
+ for (j = 0; j < kNumCoeffs; ++j) {
err_count += ref_s[j] != s[j];
}
if (err_count && !err_count_total) {
diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc
new file mode 100644
index 0000000..68ff0be
--- /dev/null
+++ b/test/masked_sad_test.cc
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int number_of_iterations = 500;
+
+typedef unsigned int (*MaskedSADFunc)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride);
+typedef std::tr1::tuple<MaskedSADFunc, MaskedSADFunc> MaskedSADParam;
+
+class MaskedSADTest : public ::testing::TestWithParam<MaskedSADParam> {
+ public:
+ virtual ~MaskedSADTest() {}
+ virtual void SetUp() {
+ maskedSAD_op_ = GET_PARAM(0);
+ ref_maskedSAD_op_ = GET_PARAM(1);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ MaskedSADFunc maskedSAD_op_;
+ MaskedSADFunc ref_maskedSAD_op_;
+};
+
+TEST_P(MaskedSADTest, OperationCheck) {
+ unsigned int ref_ret, ret;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ int err_count = 0;
+ int first_failure = -1;
+ int src_stride = MAX_SB_SIZE;
+ int ref_stride = MAX_SB_SIZE;
+ int msk_stride = MAX_SB_SIZE;
+ for (int i = 0; i < number_of_iterations; ++i) {
+ for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
+ src_ptr[j] = rnd.Rand8();
+ ref_ptr[j] = rnd.Rand8();
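+      // Mask weights are confined to [0, 64]: roughly half the entries take
+      // the maximum weight 64, the rest a random value in [0, 63].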
+ msk_ptr[j] = ((rnd.Rand8() & 0x7f) > 64) ? rnd.Rand8() & 0x3f : 64;
+ assert(msk_ptr[j] <= 64);
+ }
+
+ ref_ret = ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride,
+ msk_ptr, msk_stride);
+ ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src_ptr, src_stride, ref_ptr,
+ ref_stride, msk_ptr,
+ msk_stride));
+ if (ret != ref_ret) {
+ err_count++;
+ if (first_failure == -1) first_failure = i;
+ }
+ }
+ EXPECT_EQ(0, err_count)
+ << "Error: Masked SAD Test, C output doesn't match SSSE3 output. "
+ << "First failed at test case " << first_failure;
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride);
+typedef std::tr1::tuple<HighbdMaskedSADFunc, HighbdMaskedSADFunc>
+ HighbdMaskedSADParam;
+
+class HighbdMaskedSADTest
+ : public ::testing::TestWithParam<HighbdMaskedSADParam> {
+ public:
+ virtual ~HighbdMaskedSADTest() {}
+ virtual void SetUp() {
+ maskedSAD_op_ = GET_PARAM(0);
+ ref_maskedSAD_op_ = GET_PARAM(1);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ HighbdMaskedSADFunc maskedSAD_op_;
+ HighbdMaskedSADFunc ref_maskedSAD_op_;
+};
+
+TEST_P(HighbdMaskedSADTest, OperationCheck) {
+ unsigned int ref_ret, ret;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+ uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+ int err_count = 0;
+ int first_failure = -1;
+ int src_stride = MAX_SB_SIZE;
+ int ref_stride = MAX_SB_SIZE;
+ int msk_stride = MAX_SB_SIZE;
+ for (int i = 0; i < number_of_iterations; ++i) {
+ for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
+ src_ptr[j] = rnd.Rand16() & 0xfff;
+ ref_ptr[j] = rnd.Rand16() & 0xfff;
+ msk_ptr[j] = ((rnd.Rand8() & 0x7f) > 64) ? rnd.Rand8() & 0x3f : 64;
+ }
+
+ ref_ret = ref_maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
+ msk_ptr, msk_stride);
+ ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride, ref8_ptr,
+ ref_stride, msk_ptr,
+ msk_stride));
+ if (ret != ref_ret) {
+ err_count++;
+ if (first_failure == -1) first_failure = i;
+ }
+ }
+ EXPECT_EQ(0, err_count)
+ << "Error: High BD Masked SAD Test, C output doesn't match SSSE3 output. "
+ << "First failed at test case " << first_failure;
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+ SSSE3_C_COMPARE, MaskedSADTest,
+ ::testing::Values(
+#if CONFIG_EXT_PARTITION
+ make_tuple(&aom_masked_sad128x128_ssse3, &aom_masked_sad128x128_c),
+ make_tuple(&aom_masked_sad128x64_ssse3, &aom_masked_sad128x64_c),
+ make_tuple(&aom_masked_sad64x128_ssse3, &aom_masked_sad64x128_c),
+#endif // CONFIG_EXT_PARTITION
+ make_tuple(&aom_masked_sad64x64_ssse3, &aom_masked_sad64x64_c),
+ make_tuple(&aom_masked_sad64x32_ssse3, &aom_masked_sad64x32_c),
+ make_tuple(&aom_masked_sad32x64_ssse3, &aom_masked_sad32x64_c),
+ make_tuple(&aom_masked_sad32x32_ssse3, &aom_masked_sad32x32_c),
+ make_tuple(&aom_masked_sad32x16_ssse3, &aom_masked_sad32x16_c),
+ make_tuple(&aom_masked_sad16x32_ssse3, &aom_masked_sad16x32_c),
+ make_tuple(&aom_masked_sad16x16_ssse3, &aom_masked_sad16x16_c),
+ make_tuple(&aom_masked_sad16x8_ssse3, &aom_masked_sad16x8_c),
+ make_tuple(&aom_masked_sad8x16_ssse3, &aom_masked_sad8x16_c),
+ make_tuple(&aom_masked_sad8x8_ssse3, &aom_masked_sad8x8_c),
+ make_tuple(&aom_masked_sad8x4_ssse3, &aom_masked_sad8x4_c),
+ make_tuple(&aom_masked_sad4x8_ssse3, &aom_masked_sad4x8_c),
+ make_tuple(&aom_masked_sad4x4_ssse3, &aom_masked_sad4x4_c)));
+#if CONFIG_AOM_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(SSSE3_C_COMPARE, HighbdMaskedSADTest,
+ ::testing::Values(
+#if CONFIG_EXT_PARTITION
+ make_tuple(&aom_highbd_masked_sad128x128_ssse3,
+ &aom_highbd_masked_sad128x128_c),
+ make_tuple(&aom_highbd_masked_sad128x64_ssse3,
+ &aom_highbd_masked_sad128x64_c),
+ make_tuple(&aom_highbd_masked_sad64x128_ssse3,
+ &aom_highbd_masked_sad64x128_c),
+#endif // CONFIG_EXT_PARTITION
+ make_tuple(&aom_highbd_masked_sad64x64_ssse3,
+ &aom_highbd_masked_sad64x64_c),
+ make_tuple(&aom_highbd_masked_sad64x32_ssse3,
+ &aom_highbd_masked_sad64x32_c),
+ make_tuple(&aom_highbd_masked_sad32x64_ssse3,
+ &aom_highbd_masked_sad32x64_c),
+ make_tuple(&aom_highbd_masked_sad32x32_ssse3,
+ &aom_highbd_masked_sad32x32_c),
+ make_tuple(&aom_highbd_masked_sad32x16_ssse3,
+ &aom_highbd_masked_sad32x16_c),
+ make_tuple(&aom_highbd_masked_sad16x32_ssse3,
+ &aom_highbd_masked_sad16x32_c),
+ make_tuple(&aom_highbd_masked_sad16x16_ssse3,
+ &aom_highbd_masked_sad16x16_c),
+ make_tuple(&aom_highbd_masked_sad16x8_ssse3,
+ &aom_highbd_masked_sad16x8_c),
+ make_tuple(&aom_highbd_masked_sad8x16_ssse3,
+ &aom_highbd_masked_sad8x16_c),
+ make_tuple(&aom_highbd_masked_sad8x8_ssse3,
+ &aom_highbd_masked_sad8x8_c),
+ make_tuple(&aom_highbd_masked_sad8x4_ssse3,
+ &aom_highbd_masked_sad8x4_c),
+ make_tuple(&aom_highbd_masked_sad4x8_ssse3,
+ &aom_highbd_masked_sad4x8_c),
+ make_tuple(&aom_highbd_masked_sad4x4_ssse3,
+ &aom_highbd_masked_sad4x4_c)));
+#endif // CONFIG_AOM_HIGHBITDEPTH
+#endif // HAVE_SSSE3
+} // namespace
diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc
new file mode 100644
index 0000000..c7726bf
--- /dev/null
+++ b/test/masked_variance_test.cc
@@ -0,0 +1,789 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_mem/aom_mem.h"
+
+using libaom_test::ACMRandom;
+
+namespace {
+const int number_of_iterations = 500;
+
+typedef unsigned int (*MaskedVarianceFunc)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ unsigned int *sse);
+
+typedef std::tr1::tuple<MaskedVarianceFunc, MaskedVarianceFunc>
+ MaskedVarianceParam;
+
+class MaskedVarianceTest
+ : public ::testing::TestWithParam<MaskedVarianceParam> {
+ public:
+ virtual ~MaskedVarianceTest() {}
+ virtual void SetUp() {
+ opt_func_ = GET_PARAM(0);
+ ref_func_ = GET_PARAM(1);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ MaskedVarianceFunc opt_func_;
+ MaskedVarianceFunc ref_func_;
+};
+
+TEST_P(MaskedVarianceTest, OperationCheck) {
+ unsigned int ref_ret, opt_ret;
+ unsigned int ref_sse, opt_sse;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ int err_count = 0;
+ int first_failure = -1;
+ int src_stride = MAX_SB_SIZE;
+ int ref_stride = MAX_SB_SIZE;
+ int msk_stride = MAX_SB_SIZE;
+
+ for (int i = 0; i < number_of_iterations; ++i) {
+ for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
+ src_ptr[j] = rnd.Rand8();
+ ref_ptr[j] = rnd.Rand8();
+ msk_ptr[j] = rnd(65);
+ }
+
+ ref_ret = ref_func_(src_ptr, src_stride, ref_ptr, ref_stride, msk_ptr,
+ msk_stride, &ref_sse);
+ ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src_ptr, src_stride, ref_ptr,
+ ref_stride, msk_ptr,
+ msk_stride, &opt_sse));
+
+ if (opt_ret != ref_ret || opt_sse != ref_sse) {
+ err_count++;
+ if (first_failure == -1) first_failure = i;
+ }
+ }
+
+ EXPECT_EQ(0, err_count) << "Error: Masked Variance Test OperationCheck,"
+ << "C output doesn't match SSSE3 output. "
+ << "First failed at test case " << first_failure;
+}
+
+TEST_P(MaskedVarianceTest, ExtremeValues) {
+ unsigned int ref_ret, opt_ret;
+ unsigned int ref_sse, opt_sse;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ int err_count = 0;
+ int first_failure = -1;
+ int src_stride = MAX_SB_SIZE;
+ int ref_stride = MAX_SB_SIZE;
+ int msk_stride = MAX_SB_SIZE;
+
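+  // The 8 iterations cover every combination of extreme inputs: source all 0
+  // or all 255, reference all 0 or all 255, and mask all 0 or all 64.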
+ for (int i = 0; i < 8; ++i) {
+ memset(src_ptr, (i & 0x1) ? 255 : 0, MAX_SB_SIZE * MAX_SB_SIZE);
+ memset(ref_ptr, (i & 0x2) ? 255 : 0, MAX_SB_SIZE * MAX_SB_SIZE);
+ memset(msk_ptr, (i & 0x4) ? 64 : 0, MAX_SB_SIZE * MAX_SB_SIZE);
+
+ ref_ret = ref_func_(src_ptr, src_stride, ref_ptr, ref_stride, msk_ptr,
+ msk_stride, &ref_sse);
+ ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src_ptr, src_stride, ref_ptr,
+ ref_stride, msk_ptr,
+ msk_stride, &opt_sse));
+
+ if (opt_ret != ref_ret || opt_sse != ref_sse) {
+ err_count++;
+ if (first_failure == -1) first_failure = i;
+ }
+ }
+
+ EXPECT_EQ(0, err_count) << "Error: Masked Variance Test ExtremeValues,"
+ << "C output doesn't match SSSE3 output. "
+ << "First failed at test case " << first_failure;
+}
+
+typedef unsigned int (*MaskedSubPixelVarianceFunc)(
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
+ int b_stride, const uint8_t *m, int m_stride, unsigned int *sse);
+
+typedef std::tr1::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc>
+ MaskedSubPixelVarianceParam;
+
+class MaskedSubPixelVarianceTest
+ : public ::testing::TestWithParam<MaskedSubPixelVarianceParam> {
+ public:
+ virtual ~MaskedSubPixelVarianceTest() {}
+ virtual void SetUp() {
+ opt_func_ = GET_PARAM(0);
+ ref_func_ = GET_PARAM(1);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ MaskedSubPixelVarianceFunc opt_func_;
+ MaskedSubPixelVarianceFunc ref_func_;
+};
+
+TEST_P(MaskedSubPixelVarianceTest, OperationCheck) {
+ unsigned int ref_ret, opt_ret;
+ unsigned int ref_sse, opt_sse;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+ DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+ int err_count = 0;
+ int first_failure = -1;
+ int src_stride = (MAX_SB_SIZE + 1);
+ int ref_stride = (MAX_SB_SIZE + 1);
+ int msk_stride = (MAX_SB_SIZE + 1);
+ int xoffset;
+ int yoffset;
+
+ for (int i = 0; i < number_of_iterations; ++i) {
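+    // Exercise the two fixed offsets (0 and 4) plus one random sub-pel offset
+    // in each dimension.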
+ int xoffsets[] = { 0, 4, rnd(BIL_SUBPEL_SHIFTS) };
+ int yoffsets[] = { 0, 4, rnd(BIL_SUBPEL_SHIFTS) };
+ for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1); j++) {
+ src_ptr[j] = rnd.Rand8();
+ ref_ptr[j] = rnd.Rand8();
+ msk_ptr[j] = rnd(65);
+ }
+ for (int k = 0; k < 3; k++) {
+      xoffset = xoffsets[k];
+      for (int l = 0; l < 3; l++) {
+ yoffset = yoffsets[l];
+
+ ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
+ ref_stride, msk_ptr, msk_stride, &ref_sse);
+ ASM_REGISTER_STATE_CHECK(
+ opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
+ ref_stride, msk_ptr, msk_stride, &opt_sse));
+
+ if (opt_ret != ref_ret || opt_sse != ref_sse) {
+ err_count++;
+ if (first_failure == -1) first_failure = i;
+ }
+ }
+ }
+ }
+
+ EXPECT_EQ(0, err_count)
+ << "Error: Masked Sub Pixel Variance Test OperationCheck,"
+ << "C output doesn't match SSSE3 output. "
+ << "First failed at test case " << first_failure;
+}
+
+TEST_P(MaskedSubPixelVarianceTest, ExtremeValues) {
+ unsigned int ref_ret, opt_ret;
+ unsigned int ref_sse, opt_sse;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint8_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+ DECLARE_ALIGNED(16, uint8_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+ int first_failure_x = -1;
+ int first_failure_y = -1;
+ int err_count = 0;
+ int first_failure = -1;
+ int src_stride = (MAX_SB_SIZE + 1);
+ int ref_stride = (MAX_SB_SIZE + 1);
+ int msk_stride = (MAX_SB_SIZE + 1);
+
+ for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
+ for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
+ for (int i = 0; i < 8; ++i) {
+ memset(src_ptr, (i & 0x1) ? 255 : 0,
+ (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+ memset(ref_ptr, (i & 0x2) ? 255 : 0,
+ (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+ memset(msk_ptr, (i & 0x4) ? 64 : 0,
+ (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+
+ ref_ret = ref_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
+ ref_stride, msk_ptr, msk_stride, &ref_sse);
+ ASM_REGISTER_STATE_CHECK(
+ opt_ret = opt_func_(src_ptr, src_stride, xoffset, yoffset, ref_ptr,
+ ref_stride, msk_ptr, msk_stride, &opt_sse));
+
+ if (opt_ret != ref_ret || opt_sse != ref_sse) {
+ err_count++;
+ if (first_failure == -1) {
+ first_failure = i;
+ first_failure_x = xoffset;
+ first_failure_y = yoffset;
+ }
+ }
+ }
+ }
+ }
+
+ EXPECT_EQ(0, err_count) << "Error: Masked Variance Test ExtremeValues,"
+ << "C output doesn't match SSSE3 output. "
+ << "First failed at test case " << first_failure
+ << " x_offset = " << first_failure_x
+ << " y_offset = " << first_failure_y;
+}
+
+#if CONFIG_AOM_HIGHBITDEPTH
+typedef std::tr1::tuple<MaskedVarianceFunc, MaskedVarianceFunc, aom_bit_depth_t>
+ HighbdMaskedVarianceParam;
+
+class HighbdMaskedVarianceTest
+ : public ::testing::TestWithParam<HighbdMaskedVarianceParam> {
+ public:
+ virtual ~HighbdMaskedVarianceTest() {}
+ virtual void SetUp() {
+ opt_func_ = GET_PARAM(0);
+ ref_func_ = GET_PARAM(1);
+ bit_depth_ = GET_PARAM(2);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ MaskedVarianceFunc opt_func_;
+ MaskedVarianceFunc ref_func_;
+ aom_bit_depth_t bit_depth_;
+};
+
+TEST_P(HighbdMaskedVarianceTest, OperationCheck) {
+ unsigned int ref_ret, opt_ret;
+ unsigned int ref_sse, opt_sse;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+ uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+ int err_count = 0;
+ int first_failure = -1;
+ int src_stride = MAX_SB_SIZE;
+ int ref_stride = MAX_SB_SIZE;
+ int msk_stride = MAX_SB_SIZE;
+
+ for (int i = 0; i < number_of_iterations; ++i) {
+ for (int j = 0; j < MAX_SB_SIZE * MAX_SB_SIZE; j++) {
+ src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+ ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+ msk_ptr[j] = rnd(65);
+ }
+
+ ref_ret = ref_func_(src8_ptr, src_stride, ref8_ptr, ref_stride, msk_ptr,
+ msk_stride, &ref_sse);
+ ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src8_ptr, src_stride, ref8_ptr,
+ ref_stride, msk_ptr,
+ msk_stride, &opt_sse));
+
+ if (opt_ret != ref_ret || opt_sse != ref_sse) {
+ err_count++;
+ if (first_failure == -1) first_failure = i;
+ }
+ }
+
+ EXPECT_EQ(0, err_count) << "Error: Masked Variance Test OperationCheck,"
+ << "C output doesn't match SSSE3 output. "
+ << "First failed at test case " << first_failure;
+}
+
+TEST_P(HighbdMaskedVarianceTest, ExtremeValues) {
+ unsigned int ref_ret, opt_ret;
+ unsigned int ref_sse, opt_sse;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE * MAX_SB_SIZE]);
+ uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+ uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+ int err_count = 0;
+ int first_failure = -1;
+ int src_stride = MAX_SB_SIZE;
+ int ref_stride = MAX_SB_SIZE;
+ int msk_stride = MAX_SB_SIZE;
+
+ for (int i = 0; i < 8; ++i) {
+ aom_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0,
+ MAX_SB_SIZE * MAX_SB_SIZE);
+ aom_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0,
+ MAX_SB_SIZE * MAX_SB_SIZE);
+ memset(msk_ptr, (i & 0x4) ? 64 : 0, MAX_SB_SIZE * MAX_SB_SIZE);
+
+ ref_ret = ref_func_(src8_ptr, src_stride, ref8_ptr, ref_stride, msk_ptr,
+ msk_stride, &ref_sse);
+ ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src8_ptr, src_stride, ref8_ptr,
+ ref_stride, msk_ptr,
+ msk_stride, &opt_sse));
+
+ if (opt_ret != ref_ret || opt_sse != ref_sse) {
+ err_count++;
+ if (first_failure == -1) first_failure = i;
+ }
+ }
+
+ EXPECT_EQ(0, err_count) << "Error: Masked Variance Test ExtremeValues,"
+ << "C output doesn't match SSSE3 output. "
+ << "First failed at test case " << first_failure;
+}
+
+typedef std::tr1::tuple<MaskedSubPixelVarianceFunc, MaskedSubPixelVarianceFunc,
+ aom_bit_depth_t>
+ HighbdMaskedSubPixelVarianceParam;
+
+class HighbdMaskedSubPixelVarianceTest
+ : public ::testing::TestWithParam<HighbdMaskedSubPixelVarianceParam> {
+ public:
+ virtual ~HighbdMaskedSubPixelVarianceTest() {}
+ virtual void SetUp() {
+ opt_func_ = GET_PARAM(0);
+ ref_func_ = GET_PARAM(1);
+ bit_depth_ = GET_PARAM(2);
+ }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ MaskedSubPixelVarianceFunc opt_func_;
+ MaskedSubPixelVarianceFunc ref_func_;
+ aom_bit_depth_t bit_depth_;
+};
+
+TEST_P(HighbdMaskedSubPixelVarianceTest, OperationCheck) {
+ unsigned int ref_ret, opt_ret;
+ unsigned int ref_sse, opt_sse;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+ DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+ uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+ uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+ int err_count = 0;
+ int first_failure = -1;
+ int first_failure_x = -1;
+ int first_failure_y = -1;
+ int src_stride = (MAX_SB_SIZE + 1);
+ int ref_stride = (MAX_SB_SIZE + 1);
+ int msk_stride = (MAX_SB_SIZE + 1);
+ int xoffset, yoffset;
+
+ for (int i = 0; i < number_of_iterations; ++i) {
+ for (xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
+ for (yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
+ for (int j = 0; j < (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1); j++) {
+ src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+ ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+ msk_ptr[j] = rnd(65);
+ }
+
+ ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
+ ref_stride, msk_ptr, msk_stride, &ref_sse);
+ ASM_REGISTER_STATE_CHECK(opt_ret =
+ opt_func_(src8_ptr, src_stride, xoffset,
+ yoffset, ref8_ptr, ref_stride,
+ msk_ptr, msk_stride, &opt_sse));
+
+ if (opt_ret != ref_ret || opt_sse != ref_sse) {
+ err_count++;
+ if (first_failure == -1) {
+ first_failure = i;
+ first_failure_x = xoffset;
+ first_failure_y = yoffset;
+ }
+ }
+ }
+ }
+ }
+
+ EXPECT_EQ(0, err_count)
+ << "Error: Masked Sub Pixel Variance Test OperationCheck,"
+ << "C output doesn't match SSSE3 output. "
+ << "First failed at test case " << first_failure
+ << " x_offset = " << first_failure_x << " y_offset = " << first_failure_y;
+}
+
+TEST_P(HighbdMaskedSubPixelVarianceTest, ExtremeValues) {
+ unsigned int ref_ret, opt_ret;
+ unsigned int ref_sse, opt_sse;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+ DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+ DECLARE_ALIGNED(16, uint8_t, msk_ptr[(MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1)]);
+ uint8_t *src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+ uint8_t *ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+ int first_failure_x = -1;
+ int first_failure_y = -1;
+ int err_count = 0;
+ int first_failure = -1;
+ int src_stride = (MAX_SB_SIZE + 1);
+ int ref_stride = (MAX_SB_SIZE + 1);
+ int msk_stride = (MAX_SB_SIZE + 1);
+
+ for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
+ for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
+ for (int i = 0; i < 8; ++i) {
+ aom_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0,
+ (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+ aom_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0,
+ (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+ memset(msk_ptr, (i & 0x4) ? 64 : 0,
+ (MAX_SB_SIZE + 1) * (MAX_SB_SIZE + 1));
+
+ ref_ret = ref_func_(src8_ptr, src_stride, xoffset, yoffset, ref8_ptr,
+ ref_stride, msk_ptr, msk_stride, &ref_sse);
+ ASM_REGISTER_STATE_CHECK(opt_ret =
+ opt_func_(src8_ptr, src_stride, xoffset,
+ yoffset, ref8_ptr, ref_stride,
+ msk_ptr, msk_stride, &opt_sse));
+
+ if (opt_ret != ref_ret || opt_sse != ref_sse) {
+ err_count++;
+ if (first_failure == -1) {
+ first_failure = i;
+ first_failure_x = xoffset;
+ first_failure_y = yoffset;
+ }
+ }
+ }
+ }
+ }
+
+ EXPECT_EQ(0, err_count) << "Error: Masked Variance Test ExtremeValues,"
+ << "C output doesn't match SSSE3 output. "
+ << "First failed at test case " << first_failure
+ << " x_offset = " << first_failure_x
+ << " y_offset = " << first_failure_y;
+}
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+ SSSE3_C_COMPARE, MaskedVarianceTest,
+ ::testing::Values(
+#if CONFIG_EXT_PARTITION
+ make_tuple(&aom_masked_variance128x128_ssse3,
+ &aom_masked_variance128x128_c),
+ make_tuple(&aom_masked_variance128x64_ssse3,
+ &aom_masked_variance128x64_c),
+ make_tuple(&aom_masked_variance64x128_ssse3,
+ &aom_masked_variance64x128_c),
+#endif // CONFIG_EXT_PARTITION
+ make_tuple(&aom_masked_variance64x64_ssse3,
+ &aom_masked_variance64x64_c),
+ make_tuple(&aom_masked_variance64x32_ssse3,
+ &aom_masked_variance64x32_c),
+ make_tuple(&aom_masked_variance32x64_ssse3,
+ &aom_masked_variance32x64_c),
+ make_tuple(&aom_masked_variance32x32_ssse3,
+ &aom_masked_variance32x32_c),
+ make_tuple(&aom_masked_variance32x16_ssse3,
+ &aom_masked_variance32x16_c),
+ make_tuple(&aom_masked_variance16x32_ssse3,
+ &aom_masked_variance16x32_c),
+ make_tuple(&aom_masked_variance16x16_ssse3,
+ &aom_masked_variance16x16_c),
+ make_tuple(&aom_masked_variance16x8_ssse3, &aom_masked_variance16x8_c),
+ make_tuple(&aom_masked_variance8x16_ssse3, &aom_masked_variance8x16_c),
+ make_tuple(&aom_masked_variance8x8_ssse3, &aom_masked_variance8x8_c),
+ make_tuple(&aom_masked_variance8x4_ssse3, &aom_masked_variance8x4_c),
+ make_tuple(&aom_masked_variance4x8_ssse3, &aom_masked_variance4x8_c),
+ make_tuple(&aom_masked_variance4x4_ssse3, &aom_masked_variance4x4_c)));
+
+INSTANTIATE_TEST_CASE_P(
+ SSSE3_C_COMPARE, MaskedSubPixelVarianceTest,
+ ::testing::Values(
+#if CONFIG_EXT_PARTITION
+ make_tuple(&aom_masked_sub_pixel_variance128x128_ssse3,
+ &aom_masked_sub_pixel_variance128x128_c),
+ make_tuple(&aom_masked_sub_pixel_variance128x64_ssse3,
+ &aom_masked_sub_pixel_variance128x64_c),
+ make_tuple(&aom_masked_sub_pixel_variance64x128_ssse3,
+ &aom_masked_sub_pixel_variance64x128_c),
+#endif // CONFIG_EXT_PARTITION
+ make_tuple(&aom_masked_sub_pixel_variance64x64_ssse3,
+ &aom_masked_sub_pixel_variance64x64_c),
+ make_tuple(&aom_masked_sub_pixel_variance64x32_ssse3,
+ &aom_masked_sub_pixel_variance64x32_c),
+ make_tuple(&aom_masked_sub_pixel_variance32x64_ssse3,
+ &aom_masked_sub_pixel_variance32x64_c),
+ make_tuple(&aom_masked_sub_pixel_variance32x32_ssse3,
+ &aom_masked_sub_pixel_variance32x32_c),
+ make_tuple(&aom_masked_sub_pixel_variance32x16_ssse3,
+ &aom_masked_sub_pixel_variance32x16_c),
+ make_tuple(&aom_masked_sub_pixel_variance16x32_ssse3,
+ &aom_masked_sub_pixel_variance16x32_c),
+ make_tuple(&aom_masked_sub_pixel_variance16x16_ssse3,
+ &aom_masked_sub_pixel_variance16x16_c),
+ make_tuple(&aom_masked_sub_pixel_variance16x8_ssse3,
+ &aom_masked_sub_pixel_variance16x8_c),
+ make_tuple(&aom_masked_sub_pixel_variance8x16_ssse3,
+ &aom_masked_sub_pixel_variance8x16_c),
+ make_tuple(&aom_masked_sub_pixel_variance8x8_ssse3,
+ &aom_masked_sub_pixel_variance8x8_c),
+ make_tuple(&aom_masked_sub_pixel_variance8x4_ssse3,
+ &aom_masked_sub_pixel_variance8x4_c),
+ make_tuple(&aom_masked_sub_pixel_variance4x8_ssse3,
+ &aom_masked_sub_pixel_variance4x8_c),
+ make_tuple(&aom_masked_sub_pixel_variance4x4_ssse3,
+ &aom_masked_sub_pixel_variance4x4_c)));
+
+#if CONFIG_AOM_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+ SSSE3_C_COMPARE, HighbdMaskedVarianceTest,
+ ::testing::Values(
+#if CONFIG_EXT_PARTITION
+ make_tuple(&aom_highbd_masked_variance128x128_ssse3,
+ &aom_highbd_masked_variance128x128_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_variance128x64_ssse3,
+ &aom_highbd_masked_variance128x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_variance64x128_ssse3,
+ &aom_highbd_masked_variance64x128_c, AOM_BITS_8),
+#endif // CONFIG_EXT_PARTITION
+ make_tuple(&aom_highbd_masked_variance64x64_ssse3,
+ &aom_highbd_masked_variance64x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_variance64x32_ssse3,
+ &aom_highbd_masked_variance64x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_variance32x64_ssse3,
+ &aom_highbd_masked_variance32x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_variance32x32_ssse3,
+ &aom_highbd_masked_variance32x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_variance32x16_ssse3,
+ &aom_highbd_masked_variance32x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_variance16x32_ssse3,
+ &aom_highbd_masked_variance16x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_variance16x16_ssse3,
+ &aom_highbd_masked_variance16x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_variance16x8_ssse3,
+ &aom_highbd_masked_variance16x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_variance8x16_ssse3,
+ &aom_highbd_masked_variance8x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_variance8x8_ssse3,
+ &aom_highbd_masked_variance8x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_variance8x4_ssse3,
+ &aom_highbd_masked_variance8x4_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_variance4x8_ssse3,
+ &aom_highbd_masked_variance4x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_variance4x4_ssse3,
+ &aom_highbd_masked_variance4x4_c, AOM_BITS_8),
+#if CONFIG_EXT_PARTITION
+ make_tuple(&aom_highbd_10_masked_variance128x128_ssse3,
+ &aom_highbd_10_masked_variance128x128_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_variance128x64_ssse3,
+ &aom_highbd_10_masked_variance128x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_variance64x128_ssse3,
+ &aom_highbd_10_masked_variance64x128_c, AOM_BITS_10),
+#endif // CONFIG_EXT_PARTITION
+ make_tuple(&aom_highbd_10_masked_variance64x64_ssse3,
+ &aom_highbd_10_masked_variance64x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_variance64x32_ssse3,
+ &aom_highbd_10_masked_variance64x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_variance32x64_ssse3,
+ &aom_highbd_10_masked_variance32x64_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_variance32x32_ssse3,
+ &aom_highbd_10_masked_variance32x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_variance32x16_ssse3,
+ &aom_highbd_10_masked_variance32x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_variance16x32_ssse3,
+ &aom_highbd_10_masked_variance16x32_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_variance16x16_ssse3,
+ &aom_highbd_10_masked_variance16x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_variance16x8_ssse3,
+ &aom_highbd_10_masked_variance16x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_variance8x16_ssse3,
+ &aom_highbd_10_masked_variance8x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_variance8x8_ssse3,
+ &aom_highbd_10_masked_variance8x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_variance8x4_ssse3,
+ &aom_highbd_10_masked_variance8x4_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_variance4x8_ssse3,
+ &aom_highbd_10_masked_variance4x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_variance4x4_ssse3,
+ &aom_highbd_10_masked_variance4x4_c, AOM_BITS_10),
+#if CONFIG_EXT_PARTITION
+ make_tuple(&aom_highbd_12_masked_variance128x128_ssse3,
+ &aom_highbd_12_masked_variance128x128_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_variance128x64_ssse3,
+ &aom_highbd_12_masked_variance128x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_variance64x128_ssse3,
+ &aom_highbd_12_masked_variance64x128_c, AOM_BITS_12),
+#endif // CONFIG_EXT_PARTITION
+ make_tuple(&aom_highbd_12_masked_variance64x64_ssse3,
+ &aom_highbd_12_masked_variance64x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_variance64x32_ssse3,
+ &aom_highbd_12_masked_variance64x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_variance32x64_ssse3,
+ &aom_highbd_12_masked_variance32x64_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_variance32x32_ssse3,
+ &aom_highbd_12_masked_variance32x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_variance32x16_ssse3,
+ &aom_highbd_12_masked_variance32x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_variance16x32_ssse3,
+ &aom_highbd_12_masked_variance16x32_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_variance16x16_ssse3,
+ &aom_highbd_12_masked_variance16x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_variance16x8_ssse3,
+ &aom_highbd_12_masked_variance16x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_variance8x16_ssse3,
+ &aom_highbd_12_masked_variance8x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_variance8x8_ssse3,
+ &aom_highbd_12_masked_variance8x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_variance8x4_ssse3,
+ &aom_highbd_12_masked_variance8x4_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_variance4x8_ssse3,
+ &aom_highbd_12_masked_variance4x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_variance4x4_ssse3,
+ &aom_highbd_12_masked_variance4x4_c, AOM_BITS_12)));
+
+INSTANTIATE_TEST_CASE_P(
+ SSSE3_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
+ ::testing::Values(
+#if CONFIG_EXT_PARTITION
+ make_tuple(&aom_highbd_masked_sub_pixel_variance128x128_ssse3,
+ &aom_highbd_masked_sub_pixel_variance128x128_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_sub_pixel_variance128x64_ssse3,
+ &aom_highbd_masked_sub_pixel_variance128x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_sub_pixel_variance64x128_ssse3,
+ &aom_highbd_masked_sub_pixel_variance64x128_c, AOM_BITS_8),
+#endif // CONFIG_EXT_PARTITION
+ make_tuple(&aom_highbd_masked_sub_pixel_variance64x64_ssse3,
+ &aom_highbd_masked_sub_pixel_variance64x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_sub_pixel_variance64x32_ssse3,
+ &aom_highbd_masked_sub_pixel_variance64x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_sub_pixel_variance32x64_ssse3,
+ &aom_highbd_masked_sub_pixel_variance32x64_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_sub_pixel_variance32x32_ssse3,
+ &aom_highbd_masked_sub_pixel_variance32x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_sub_pixel_variance32x16_ssse3,
+ &aom_highbd_masked_sub_pixel_variance32x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_sub_pixel_variance16x32_ssse3,
+ &aom_highbd_masked_sub_pixel_variance16x32_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_sub_pixel_variance16x16_ssse3,
+ &aom_highbd_masked_sub_pixel_variance16x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_sub_pixel_variance16x8_ssse3,
+ &aom_highbd_masked_sub_pixel_variance16x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_sub_pixel_variance8x16_ssse3,
+ &aom_highbd_masked_sub_pixel_variance8x16_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_sub_pixel_variance8x8_ssse3,
+ &aom_highbd_masked_sub_pixel_variance8x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_sub_pixel_variance8x4_ssse3,
+ &aom_highbd_masked_sub_pixel_variance8x4_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_sub_pixel_variance4x8_ssse3,
+ &aom_highbd_masked_sub_pixel_variance4x8_c, AOM_BITS_8),
+ make_tuple(&aom_highbd_masked_sub_pixel_variance4x4_ssse3,
+ &aom_highbd_masked_sub_pixel_variance4x4_c, AOM_BITS_8),
+#if CONFIG_EXT_PARTITION
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x128_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance128x128_c,
+ AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance128x64_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance128x64_c,
+ AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x128_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance64x128_c,
+ AOM_BITS_10),
+#endif // CONFIG_EXT_PARTITION
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x64_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance64x64_c,
+ AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance64x32_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance64x32_c,
+ AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x64_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance32x64_c,
+ AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x32_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance32x32_c,
+ AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance32x16_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance32x16_c,
+ AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x32_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance16x32_c,
+ AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x16_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance16x16_c,
+ AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance16x8_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance16x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x16_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance8x16_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x8_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance8x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance8x4_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance8x4_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x8_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance4x8_c, AOM_BITS_10),
+ make_tuple(&aom_highbd_10_masked_sub_pixel_variance4x4_ssse3,
+ &aom_highbd_10_masked_sub_pixel_variance4x4_c, AOM_BITS_10),
+#if CONFIG_EXT_PARTITION
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x128_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance128x128_c,
+ AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance128x64_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance128x64_c,
+ AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x128_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance64x128_c,
+ AOM_BITS_12),
+#endif // CONFIG_EXT_PARTITION
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x64_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance64x64_c,
+ AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance64x32_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance64x32_c,
+ AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x64_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance32x64_c,
+ AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x32_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance32x32_c,
+ AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance32x16_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance32x16_c,
+ AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x32_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance16x32_c,
+ AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x16_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance16x16_c,
+ AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance16x8_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance16x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x16_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance8x16_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x8_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance8x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance8x4_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance8x4_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x8_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance4x8_c, AOM_BITS_12),
+ make_tuple(&aom_highbd_12_masked_sub_pixel_variance4x4_ssse3,
+ &aom_highbd_12_masked_sub_pixel_variance4x4_c,
+ AOM_BITS_12)));
+#endif // CONFIG_AOM_HIGHBITDEPTH
+
+#endif // HAVE_SSSE3
+} // namespace
diff --git a/test/minmax_test.cc b/test/minmax_test.cc
new file mode 100644
index 0000000..735f617
--- /dev/null
+++ b/test/minmax_test.cc
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+
+namespace {
+
+using ::libaom_test::ACMRandom;
+
+typedef void (*MinMaxFunc)(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int *min, int *max);
+
+class MinMaxTest : public ::testing::TestWithParam<MinMaxFunc> {
+ public:
+ virtual void SetUp() {
+ mm_func_ = GetParam();
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ }
+
+ protected:
+ MinMaxFunc mm_func_;
+ ACMRandom rnd_;
+};
+
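+// Brute-force reference: the smallest and largest absolute difference over an
+// 8x8 block.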
+void reference_minmax(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int *min_ret, int *max_ret) {
+ int min = 255;
+ int max = 0;
+ for (int i = 0; i < 8; i++) {
+ for (int j = 0; j < 8; j++) {
+ const int diff = abs(a[i * a_stride + j] - b[i * b_stride + j]);
+ if (min > diff) min = diff;
+ if (max < diff) max = diff;
+ }
+ }
+
+ *min_ret = min;
+ *max_ret = max;
+}
+
+TEST_P(MinMaxTest, MinValue) {
+ for (int i = 0; i < 64; i++) {
+ uint8_t a[64], b[64];
+ memset(a, 0, sizeof(a));
+ memset(b, 255, sizeof(b));
+ b[i] = i; // Set a minimum difference of i.
+
+ int min, max;
+ ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+ EXPECT_EQ(255, max);
+ EXPECT_EQ(i, min);
+ }
+}
+
+TEST_P(MinMaxTest, MaxValue) {
+ for (int i = 0; i < 64; i++) {
+ uint8_t a[64], b[64];
+ memset(a, 0, sizeof(a));
+ memset(b, 0, sizeof(b));
+ b[i] = i; // Set a maximum difference of i.
+
+ int min, max;
+ ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+ EXPECT_EQ(i, max);
+ EXPECT_EQ(0, min);
+ }
+}
+
+TEST_P(MinMaxTest, CompareReference) {
+ uint8_t a[64], b[64];
+ for (int j = 0; j < 64; j++) {
+ a[j] = rnd_.Rand8();
+ b[j] = rnd_.Rand8();
+ }
+
+ int min_ref, max_ref, min, max;
+ reference_minmax(a, 8, b, 8, &min_ref, &max_ref);
+ ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max));
+ EXPECT_EQ(max_ref, max);
+ EXPECT_EQ(min_ref, min);
+}
+
+TEST_P(MinMaxTest, CompareReferenceAndVaryStride) {
+ uint8_t a[8 * 64], b[8 * 64];
+ for (int i = 0; i < 8 * 64; i++) {
+ a[i] = rnd_.Rand8();
+ b[i] = rnd_.Rand8();
+ }
+ for (int a_stride = 8; a_stride <= 64; a_stride += 8) {
+ for (int b_stride = 8; b_stride <= 64; b_stride += 8) {
+ int min_ref, max_ref, min, max;
+ reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref);
+ ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max));
+ EXPECT_EQ(max_ref, max) << "when a_stride = " << a_stride
+ << " and b_stride = " << b_stride;
+ EXPECT_EQ(min_ref, min) << "when a_stride = " << a_stride
+ << " and b_stride = " << b_stride;
+ }
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(C, MinMaxTest, ::testing::Values(&aom_minmax_8x8_c));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, MinMaxTest,
+ ::testing::Values(&aom_minmax_8x8_sse2));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, MinMaxTest,
+ ::testing::Values(&aom_minmax_8x8_neon));
+#endif
+
+} // namespace
diff --git a/test/obmc_sad_test.cc b/test/obmc_sad_test.cc
index e1f7652..dc7d800 100644
--- a/test/obmc_sad_test.cc
+++ b/test/obmc_sad_test.cc
@@ -85,6 +85,11 @@
#if HAVE_SSE4_1
const ObmcSadTest::ParamType sse4_functions[] = {
+#if CONFIG_EXT_PARTITION
+ TestFuncs(aom_obmc_sad128x128_c, aom_obmc_sad128x128_sse4_1),
+ TestFuncs(aom_obmc_sad128x64_c, aom_obmc_sad128x64_sse4_1),
+ TestFuncs(aom_obmc_sad64x128_c, aom_obmc_sad64x128_sse4_1),
+#endif // CONFIG_EXT_PARTITION
TestFuncs(aom_obmc_sad64x64_c, aom_obmc_sad64x64_sse4_1),
TestFuncs(aom_obmc_sad64x32_c, aom_obmc_sad64x32_sse4_1),
TestFuncs(aom_obmc_sad32x64_c, aom_obmc_sad32x64_sse4_1),
@@ -163,6 +168,11 @@
#if HAVE_SSE4_1
ObmcSadHBDTest::ParamType sse4_functions_hbd[] = {
+#if CONFIG_EXT_PARTITION
+ TestFuncs(aom_highbd_obmc_sad128x128_c, aom_highbd_obmc_sad128x128_sse4_1),
+ TestFuncs(aom_highbd_obmc_sad128x64_c, aom_highbd_obmc_sad128x64_sse4_1),
+ TestFuncs(aom_highbd_obmc_sad64x128_c, aom_highbd_obmc_sad64x128_sse4_1),
+#endif // CONFIG_EXT_PARTITION
TestFuncs(aom_highbd_obmc_sad64x64_c, aom_highbd_obmc_sad64x64_sse4_1),
TestFuncs(aom_highbd_obmc_sad64x32_c, aom_highbd_obmc_sad64x32_sse4_1),
TestFuncs(aom_highbd_obmc_sad32x64_c, aom_highbd_obmc_sad32x64_sse4_1),
diff --git a/test/obmc_variance_test.cc b/test/obmc_variance_test.cc
index 3c42bfa..9dcb50a 100644
--- a/test/obmc_variance_test.cc
+++ b/test/obmc_variance_test.cc
@@ -94,6 +94,11 @@
#if HAVE_SSE4_1
const ObmcVarianceTest::ParamType sse4_functions[] = {
+#if CONFIG_EXT_PARTITION
+ TestFuncs(aom_obmc_variance128x128_c, aom_obmc_variance128x128_sse4_1),
+ TestFuncs(aom_obmc_variance128x64_c, aom_obmc_variance128x64_sse4_1),
+ TestFuncs(aom_obmc_variance64x128_c, aom_obmc_variance64x128_sse4_1),
+#endif // CONFIG_EXT_PARTITION
TestFuncs(aom_obmc_variance64x64_c, aom_obmc_variance64x64_sse4_1),
TestFuncs(aom_obmc_variance64x32_c, aom_obmc_variance64x32_sse4_1),
TestFuncs(aom_obmc_variance32x64_c, aom_obmc_variance32x64_sse4_1),
@@ -177,6 +182,14 @@
#if HAVE_SSE4_1
ObmcVarianceHBDTest::ParamType sse4_functions_hbd[] = {
+#if CONFIG_EXT_PARTITION
+ TestFuncs(aom_highbd_obmc_variance128x128_c,
+ aom_highbd_obmc_variance128x128_sse4_1, 8),
+ TestFuncs(aom_highbd_obmc_variance128x64_c,
+ aom_highbd_obmc_variance128x64_sse4_1, 8),
+ TestFuncs(aom_highbd_obmc_variance64x128_c,
+ aom_highbd_obmc_variance64x128_sse4_1, 8),
+#endif // CONFIG_EXT_PARTITION
TestFuncs(aom_highbd_obmc_variance64x64_c,
aom_highbd_obmc_variance64x64_sse4_1, 8),
TestFuncs(aom_highbd_obmc_variance64x32_c,
@@ -203,6 +216,14 @@
8),
TestFuncs(aom_highbd_obmc_variance4x4_c, aom_highbd_obmc_variance4x4_sse4_1,
8),
+#if CONFIG_EXT_PARTITION
+ TestFuncs(aom_highbd_10_obmc_variance128x128_c,
+ aom_highbd_10_obmc_variance128x128_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance128x64_c,
+ aom_highbd_10_obmc_variance128x64_sse4_1, 10),
+ TestFuncs(aom_highbd_10_obmc_variance64x128_c,
+ aom_highbd_10_obmc_variance64x128_sse4_1, 10),
+#endif // CONFIG_EXT_PARTITION
TestFuncs(aom_highbd_10_obmc_variance64x64_c,
aom_highbd_10_obmc_variance64x64_sse4_1, 10),
TestFuncs(aom_highbd_10_obmc_variance64x32_c,
@@ -229,6 +250,14 @@
aom_highbd_10_obmc_variance4x8_sse4_1, 10),
TestFuncs(aom_highbd_10_obmc_variance4x4_c,
aom_highbd_10_obmc_variance4x4_sse4_1, 10),
+#if CONFIG_EXT_PARTITION
+ TestFuncs(aom_highbd_12_obmc_variance128x128_c,
+ aom_highbd_12_obmc_variance128x128_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance128x64_c,
+ aom_highbd_12_obmc_variance128x64_sse4_1, 12),
+ TestFuncs(aom_highbd_12_obmc_variance64x128_c,
+ aom_highbd_12_obmc_variance64x128_sse4_1, 12),
+#endif // CONFIG_EXT_PARTITION
TestFuncs(aom_highbd_12_obmc_variance64x64_c,
aom_highbd_12_obmc_variance64x64_sse4_1, 12),
TestFuncs(aom_highbd_12_obmc_variance64x32_c,
diff --git a/test/quantize_test.cc b/test/quantize_test.cc
index ec945ee..4908e99 100644
--- a/test/quantize_test.cc
+++ b/test/quantize_test.cc
@@ -9,339 +9,197 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include <math.h>
-#include <stdlib.h>
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./aom_config.h"
-#include "./aom_dsp_rtcd.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "test/util.h"
-#include "av1/common/entropy.h"
-#include "av1/common/scan.h"
-#include "aom/aom_codec.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/onyx.h"
+#include "vp8/encoder/block.h"
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/encoder/quantize.h"
#include "aom/aom_integer.h"
-
-using libaom_test::ACMRandom;
+#include "aom_mem/aom_mem.h"
namespace {
#if !CONFIG_AOM_QM
-#if CONFIG_AOM_HIGHBITDEPTH
-const int number_of_iterations = 100;
-typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
- int skip_block, const int16_t *zbin,
- const int16_t *round, const int16_t *quant,
- const int16_t *quant_shift, tran_low_t *qcoeff,
- tran_low_t *dqcoeff, const int16_t *dequant,
- uint16_t *eob, const int16_t *scan,
- const int16_t *iscan);
-typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, aom_bit_depth_t>
- QuantizeParam;
+const int kNumBlocks = 25;
+const int kNumBlockEntries = 16;
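+// A VP8 macroblock is quantized as 25 blocks (16 luma, 4 + 4 chroma and one
+// second-order Y2 block) of 16 coefficients each.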
-class AV1QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
- public:
- virtual ~AV1QuantizeTest() {}
- virtual void SetUp() {
- quantize_op_ = GET_PARAM(0);
- ref_quantize_op_ = GET_PARAM(1);
- bit_depth_ = GET_PARAM(2);
- mask_ = (1 << bit_depth_) - 1;
- }
+typedef void (*VP8Quantize)(BLOCK *b, BLOCKD *d);
- virtual void TearDown() { libaom_test::ClearSystemState(); }
+typedef std::tr1::tuple<VP8Quantize, VP8Quantize> VP8QuantizeParam;
- protected:
- aom_bit_depth_t bit_depth_;
- int mask_;
- QuantizeFunc quantize_op_;
- QuantizeFunc ref_quantize_op_;
-};
-
-class AV1Quantize32Test : public ::testing::TestWithParam<QuantizeParam> {
- public:
- virtual ~AV1Quantize32Test() {}
- virtual void SetUp() {
- quantize_op_ = GET_PARAM(0);
- ref_quantize_op_ = GET_PARAM(1);
- bit_depth_ = GET_PARAM(2);
- mask_ = (1 << bit_depth_) - 1;
- }
-
- virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
- aom_bit_depth_t bit_depth_;
- int mask_;
- QuantizeFunc quantize_op_;
- QuantizeFunc ref_quantize_op_;
-};
-
-TEST_P(AV1QuantizeTest, OperationCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]);
- DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
- DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]);
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]);
- DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]);
- DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]);
- DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
- DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
- DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
- int err_count_total = 0;
- int first_failure = -1;
- for (int i = 0; i < number_of_iterations; ++i) {
- const int skip_block = i == 0;
- const TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16
- const TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
- const SCAN_ORDER *scan_order = &av1_scan_orders[sz][tx_type];
- const int count = (4 << sz) * (4 << sz); // 16, 64, 256
- int err_count = 0;
- *eob_ptr = rnd.Rand16();
- *ref_eob_ptr = *eob_ptr;
- for (int j = 0; j < count; j++) {
- coeff_ptr[j] = rnd.Rand16() & mask_;
- }
- for (int j = 0; j < 2; j++) {
- zbin_ptr[j] = rnd.Rand16() & mask_;
- round_ptr[j] = rnd.Rand16();
- quant_ptr[j] = rnd.Rand16();
- quant_shift_ptr[j] = rnd.Rand16();
- dequant_ptr[j] = rnd.Rand16();
- }
- ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
- ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr,
- scan_order->scan, scan_order->iscan);
- ASM_REGISTER_STATE_CHECK(quantize_op_(
- coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
- quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
- scan_order->scan, scan_order->iscan));
- for (int j = 0; j < sz; ++j) {
- err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
- (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
- }
- err_count += (*ref_eob_ptr != *eob_ptr);
- if (err_count && !err_count_total) {
- first_failure = i;
- }
- err_count_total += err_count;
- }
- EXPECT_EQ(0, err_count_total)
- << "Error: Quantization Test, C output doesn't match SSE2 output. "
- << "First failed at test case " << first_failure;
-}
-
-TEST_P(AV1Quantize32Test, OperationCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]);
- DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
- DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]);
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]);
- DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]);
- DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]);
- DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
- DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
- DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
- int err_count_total = 0;
- int first_failure = -1;
- for (int i = 0; i < number_of_iterations; ++i) {
- const int skip_block = i == 0;
- const TX_SIZE sz = TX_32X32;
- const TX_TYPE tx_type = (TX_TYPE)(i % 4);
- const SCAN_ORDER *scan_order = &av1_scan_orders[sz][tx_type];
- const int count = (4 << sz) * (4 << sz); // 1024
- int err_count = 0;
- *eob_ptr = rnd.Rand16();
- *ref_eob_ptr = *eob_ptr;
- for (int j = 0; j < count; j++) {
- coeff_ptr[j] = rnd.Rand16() & mask_;
- }
- for (int j = 0; j < 2; j++) {
- zbin_ptr[j] = rnd.Rand16() & mask_;
- round_ptr[j] = rnd.Rand16();
- quant_ptr[j] = rnd.Rand16();
- quant_shift_ptr[j] = rnd.Rand16();
- dequant_ptr[j] = rnd.Rand16();
- }
- ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
- ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr,
- scan_order->scan, scan_order->iscan);
- ASM_REGISTER_STATE_CHECK(quantize_op_(
- coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
- quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
- scan_order->scan, scan_order->iscan));
- for (int j = 0; j < sz; ++j) {
- err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
- (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
- }
- err_count += (*ref_eob_ptr != *eob_ptr);
- if (err_count && !err_count_total) {
- first_failure = i;
- }
- err_count_total += err_count;
- }
- EXPECT_EQ(0, err_count_total)
- << "Error: Quantization Test, C output doesn't match SSE2 output. "
- << "First failed at test case " << first_failure;
-}
-
-TEST_P(AV1QuantizeTest, EOBCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]);
- DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
- DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]);
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]);
- DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]);
- DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]);
- DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
- DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
- DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
- int err_count_total = 0;
- int first_failure = -1;
- for (int i = 0; i < number_of_iterations; ++i) {
- int skip_block = i == 0;
- TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16
- TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3);
- const SCAN_ORDER *scan_order = &av1_scan_orders[sz][tx_type];
- int count = (4 << sz) * (4 << sz); // 16, 64, 256
- int err_count = 0;
- *eob_ptr = rnd.Rand16();
- *ref_eob_ptr = *eob_ptr;
- // Two random entries
- for (int j = 0; j < count; j++) {
- coeff_ptr[j] = 0;
- }
- coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
- coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
- for (int j = 0; j < 2; j++) {
- zbin_ptr[j] = rnd.Rand16() & mask_;
- round_ptr[j] = rnd.Rand16();
- quant_ptr[j] = rnd.Rand16();
- quant_shift_ptr[j] = rnd.Rand16();
- dequant_ptr[j] = rnd.Rand16();
- }
-
- ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
- ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr,
- scan_order->scan, scan_order->iscan);
- ASM_REGISTER_STATE_CHECK(quantize_op_(
- coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
- quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
- scan_order->scan, scan_order->iscan));
-
- for (int j = 0; j < sz; ++j) {
- err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
- (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
- }
- err_count += (*ref_eob_ptr != *eob_ptr);
- if (err_count && !err_count_total) {
- first_failure = i;
- }
- err_count_total += err_count;
- }
- EXPECT_EQ(0, err_count_total)
- << "Error: Quantization Test, C output doesn't match SSE2 output. "
- << "First failed at test case " << first_failure;
-}
-
-TEST_P(AV1Quantize32Test, EOBCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]);
- DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, round_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_ptr[2]);
- DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]);
- DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]);
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]);
- DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]);
- DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]);
- DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]);
- DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]);
- DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]);
- int err_count_total = 0;
- int first_failure = -1;
- for (int i = 0; i < number_of_iterations; ++i) {
- int skip_block = i == 0;
- TX_SIZE sz = TX_32X32;
- TX_TYPE tx_type = (TX_TYPE)(i % 4);
- const SCAN_ORDER *scan_order = &av1_scan_orders[sz][tx_type];
- int count = (4 << sz) * (4 << sz); // 1024
- int err_count = 0;
- *eob_ptr = rnd.Rand16();
- *ref_eob_ptr = *eob_ptr;
- for (int j = 0; j < count; j++) {
- coeff_ptr[j] = 0;
- }
- // Two random entries
- coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
- coeff_ptr[rnd(count)] = rnd.Rand16() & mask_;
- for (int j = 0; j < 2; j++) {
- zbin_ptr[j] = rnd.Rand16() & mask_;
- round_ptr[j] = rnd.Rand16();
- quant_ptr[j] = rnd.Rand16();
- quant_shift_ptr[j] = rnd.Rand16();
- dequant_ptr[j] = rnd.Rand16();
- }
-
- ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, ref_qcoeff_ptr,
- ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr,
- scan_order->scan, scan_order->iscan);
- ASM_REGISTER_STATE_CHECK(quantize_op_(
- coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr,
- quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
- scan_order->scan, scan_order->iscan));
-
- for (int j = 0; j < sz; ++j) {
- err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) |
- (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]);
- }
- err_count += (*ref_eob_ptr != *eob_ptr);
- if (err_count && !err_count_total) {
- first_failure = i;
- }
- err_count_total += err_count;
- }
- EXPECT_EQ(0, err_count_total)
- << "Error: Quantization Test, C output doesn't match SSE2 output. "
- << "First failed at test case " << first_failure;
-}
+using libaom_test::ACMRandom;
using std::tr1::make_tuple;
+// Create and populate a VP8_COMP instance which has a complete set of
+// quantization inputs as well as a second MACROBLOCKD for output.
+class QuantizeTestBase {
+ public:
+ virtual ~QuantizeTestBase() {
+ vp8_remove_compressor(&vp8_comp_);
+ vp8_comp_ = NULL;
+ aom_free(macroblockd_dst_);
+ macroblockd_dst_ = NULL;
+ libaom_test::ClearSystemState();
+ }
+
+ protected:
+ void SetupCompressor() {
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+
+ // The full configuration is necessary to generate the quantization tables.
+ VP8_CONFIG vp8_config;
+ memset(&vp8_config, 0, sizeof(vp8_config));
+
+ vp8_comp_ = vp8_create_compressor(&vp8_config);
+
+ // Set the tables based on a quantizer of 0.
+ vp8_set_quantizer(vp8_comp_, 0);
+
+ // Set up all the block/blockd pointers for the mb in vp8_comp_.
+ vp8cx_frame_init_quantizer(vp8_comp_);
+
+ // Copy macroblockd from the reference to get pre-set-up dequant values.
+ macroblockd_dst_ = reinterpret_cast<MACROBLOCKD *>(
+ aom_memalign(32, sizeof(*macroblockd_dst_)));
+ memcpy(macroblockd_dst_, &vp8_comp_->mb.e_mbd, sizeof(*macroblockd_dst_));
+ // Fix block pointers - currently they point to the blocks in the reference
+ // structure.
+ vp8_setup_block_dptrs(macroblockd_dst_);
+ }
+
+ void UpdateQuantizer(int q) {
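+    // Apply the new quantizer, then re-copy the reference MACROBLOCKD so
+    // macroblockd_dst_ picks up the updated dequant values.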
+ vp8_set_quantizer(vp8_comp_, q);
+
+ memcpy(macroblockd_dst_, &vp8_comp_->mb.e_mbd, sizeof(*macroblockd_dst_));
+ vp8_setup_block_dptrs(macroblockd_dst_);
+ }
+
+ void FillCoeffConstant(int16_t c) {
+ for (int i = 0; i < kNumBlocks * kNumBlockEntries; ++i) {
+ vp8_comp_->mb.coeff[i] = c;
+ }
+ }
+
+ void FillCoeffRandom() {
+ for (int i = 0; i < kNumBlocks * kNumBlockEntries; ++i) {
+ vp8_comp_->mb.coeff[i] = rnd_.Rand8();
+ }
+ }
+
+ void CheckOutput() {
+ EXPECT_EQ(0, memcmp(vp8_comp_->mb.e_mbd.qcoeff, macroblockd_dst_->qcoeff,
+ sizeof(*macroblockd_dst_->qcoeff) * kNumBlocks *
+ kNumBlockEntries))
+ << "qcoeff mismatch";
+ EXPECT_EQ(0, memcmp(vp8_comp_->mb.e_mbd.dqcoeff, macroblockd_dst_->dqcoeff,
+ sizeof(*macroblockd_dst_->dqcoeff) * kNumBlocks *
+ kNumBlockEntries))
+ << "dqcoeff mismatch";
+ EXPECT_EQ(0, memcmp(vp8_comp_->mb.e_mbd.eobs, macroblockd_dst_->eobs,
+ sizeof(*macroblockd_dst_->eobs) * kNumBlocks))
+ << "eobs mismatch";
+ }
+
+ VP8_COMP *vp8_comp_;
+ MACROBLOCKD *macroblockd_dst_;
+
+ private:
+ ACMRandom rnd_;
+};
+
+class QuantizeTest : public QuantizeTestBase,
+ public ::testing::TestWithParam<VP8QuantizeParam> {
+ protected:
+ virtual void SetUp() {
+ SetupCompressor();
+ asm_quant_ = GET_PARAM(0);
+ c_quant_ = GET_PARAM(1);
+ }
+
+ void RunComparison() {
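+    // Quantize each block with the reference (C) function into vp8_comp_'s own
+    // MACROBLOCKD and with the function under test into macroblockd_dst_, then
+    // compare the two sets of outputs.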
+ for (int i = 0; i < kNumBlocks; ++i) {
+ ASM_REGISTER_STATE_CHECK(
+ c_quant_(&vp8_comp_->mb.block[i], &vp8_comp_->mb.e_mbd.block[i]));
+ ASM_REGISTER_STATE_CHECK(
+          asm_quant_(&vp8_comp_->mb.block[i], &macroblockd_dst_->block[i]));
+ }
+
+ CheckOutput();
+ }
+
+ private:
+ VP8Quantize asm_quant_;
+ VP8Quantize c_quant_;
+};
+
+TEST_P(QuantizeTest, TestZeroInput) {
+ FillCoeffConstant(0);
+ RunComparison();
+}
+
+TEST_P(QuantizeTest, TestLargeNegativeInput) {
+ FillCoeffConstant(0);
+ // Generate a qcoeff which contains 512/-512 (0x0100/0xFE00) to catch issues
+ // like BUG=883 where the constant being compared was incorrectly initialized.
+ vp8_comp_->mb.coeff[0] = -8191;
+ RunComparison();
+}
+
+TEST_P(QuantizeTest, TestRandomInput) {
+ FillCoeffRandom();
+ RunComparison();
+}
+
+TEST_P(QuantizeTest, TestMultipleQ) {
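+  // Repeat the comparison across the full quantizer index range with fresh
+  // random coefficients at each step.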
+ for (int q = 0; q < QINDEX_RANGE; ++q) {
+ UpdateQuantizer(q);
+ FillCoeffRandom();
+ RunComparison();
+ }
+}
+
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
- SSE2, AV1QuantizeTest,
- ::testing::Values(make_tuple(&aom_highbd_quantize_b_sse2,
- &aom_highbd_quantize_b_c, AOM_BITS_8),
- make_tuple(&aom_highbd_quantize_b_sse2,
- &aom_highbd_quantize_b_c, AOM_BITS_10),
- make_tuple(&aom_highbd_quantize_b_sse2,
- &aom_highbd_quantize_b_c, AOM_BITS_12)));
-INSTANTIATE_TEST_CASE_P(
- SSE2, AV1Quantize32Test,
- ::testing::Values(make_tuple(&aom_highbd_quantize_b_32x32_sse2,
- &aom_highbd_quantize_b_32x32_c, AOM_BITS_8),
- make_tuple(&aom_highbd_quantize_b_32x32_sse2,
- &aom_highbd_quantize_b_32x32_c, AOM_BITS_10),
- make_tuple(&aom_highbd_quantize_b_32x32_sse2,
- &aom_highbd_quantize_b_32x32_c, AOM_BITS_12)));
+ SSE2, QuantizeTest,
+ ::testing::Values(
+ make_tuple(&vp8_fast_quantize_b_sse2, &vp8_fast_quantize_b_c),
+ make_tuple(&vp8_regular_quantize_b_sse2, &vp8_regular_quantize_b_c)));
#endif // HAVE_SSE2
-#endif // CONFIG_AOM_HIGHBITDEPTH
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(SSSE3, QuantizeTest,
+ ::testing::Values(make_tuple(&vp8_fast_quantize_b_ssse3,
+ &vp8_fast_quantize_b_c)));
+#endif // HAVE_SSSE3
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, QuantizeTest,
+ ::testing::Values(make_tuple(&vp8_regular_quantize_b_sse4_1,
+ &vp8_regular_quantize_b_c)));
+#endif // HAVE_SSE4_1
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, QuantizeTest,
+ ::testing::Values(make_tuple(&vp8_fast_quantize_b_neon,
+ &vp8_fast_quantize_b_c)));
+#endif // HAVE_NEON
+
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(
+ MSA, QuantizeTest,
+ ::testing::Values(
+ make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c),
+ make_tuple(&vp8_regular_quantize_b_msa, &vp8_regular_quantize_b_c)));
+#endif // HAVE_MSA
#endif // CONFIG_AOM_QM
} // namespace
diff --git a/test/realtime_test.cc b/test/realtime_test.cc
new file mode 100644
index 0000000..0c99291
--- /dev/null
+++ b/test/realtime_test.cc
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace {
+
+const int kVideoSourceWidth = 320;
+const int kVideoSourceHeight = 240;
+const int kFramesToEncode = 2;
+
+class RealtimeTest
+ : public ::libaom_test::EncoderTest,
+ public ::libaom_test::CodecTestWithParam<libaom_test::TestMode> {
+ protected:
+ RealtimeTest() : EncoderTest(GET_PARAM(0)), frame_packets_(0) {}
+ virtual ~RealtimeTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ cfg_.g_lag_in_frames = 0;
+ SetMode(::libaom_test::kRealTime);
+ }
+
+ virtual void BeginPassHook(unsigned int /*pass*/) {
+ // TODO(tomfinegan): We're changing the pass value here to make sure
+ // we get frames when real time mode is combined with |g_pass| set to
+ // AOM_RC_FIRST_PASS. This is necessary because EncoderTest::RunLoop() sets
+ // the pass value based on the mode passed into EncoderTest::SetMode(),
+ // which overrides the one specified in SetUp() above.
+ cfg_.g_pass = AOM_RC_FIRST_PASS;
+ }
+ virtual void FramePktHook(const aom_codec_cx_pkt_t * /*pkt*/) {
+ frame_packets_++;
+ }
+
+ int frame_packets_;
+};
+
+TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) {
+ ::libaom_test::RandomVideoSource video;
+ video.SetSize(kVideoSourceWidth, kVideoSourceHeight);
+ video.set_limit(kFramesToEncode);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ EXPECT_EQ(kFramesToEncode, frame_packets_);
+}
+
+AV1_INSTANTIATE_TEST_CASE(RealtimeTest,
+ ::testing::Values(::libaom_test::kRealTime));
+
+} // namespace
diff --git a/test/register_state_check.h b/test/register_state_check.h
index 58223f9..3ff41d1 100644
--- a/test/register_state_check.h
+++ b/test/register_state_check.h
@@ -37,16 +37,10 @@
#include <windows.h>
#include <winnt.h>
-namespace testing {
-namespace internal {
-
inline bool operator==(const M128A &lhs, const M128A &rhs) {
return (lhs.Low == rhs.Low && lhs.High == rhs.High);
}
-} // namespace internal
-} // namespace testing
-
namespace libaom_test {
// Compares the state of xmm[6-15] at construction with their state at
diff --git a/test/resize_test.cc b/test/resize_test.cc
index a36c18e..9cee841 100644
--- a/test/resize_test.cc
+++ b/test/resize_test.cc
@@ -91,29 +91,175 @@
unsigned int h;
};
-unsigned int ScaleForFrameNumber(unsigned int frame, unsigned int val) {
- if (frame < 10) return val;
- if (frame < 20) return val / 2;
- if (frame < 30) return val * 2 / 3;
- if (frame < 40) return val / 4;
- if (frame < 50) return val * 7 / 8;
- return val;
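+// Computes the encode size for |frame|: every 10 frames the size steps among
+// the full, 3/4 and 1/2 scales of the initial dimensions, with a brief 1/4
+// dip and, when flag_codec is 1 (AV1), a width/height swap near the end.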
+void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
+ unsigned int initial_h, unsigned int *w,
+ unsigned int *h, int flag_codec) {
+ if (frame < 10) {
+ *w = initial_w;
+ *h = initial_h;
+ return;
+ }
+ if (frame < 20) {
+ *w = initial_w * 3 / 4;
+ *h = initial_h * 3 / 4;
+ return;
+ }
+ if (frame < 30) {
+ *w = initial_w / 2;
+ *h = initial_h / 2;
+ return;
+ }
+ if (frame < 40) {
+ *w = initial_w;
+ *h = initial_h;
+ return;
+ }
+ if (frame < 50) {
+ *w = initial_w * 3 / 4;
+ *h = initial_h * 3 / 4;
+ return;
+ }
+ if (frame < 60) {
+ *w = initial_w / 2;
+ *h = initial_h / 2;
+ return;
+ }
+ if (frame < 70) {
+ *w = initial_w;
+ *h = initial_h;
+ return;
+ }
+ if (frame < 80) {
+ *w = initial_w * 3 / 4;
+ *h = initial_h * 3 / 4;
+ return;
+ }
+ if (frame < 90) {
+ *w = initial_w / 2;
+ *h = initial_h / 2;
+ return;
+ }
+ if (frame < 100) {
+ *w = initial_w * 3 / 4;
+ *h = initial_h * 3 / 4;
+ return;
+ }
+ if (frame < 110) {
+ *w = initial_w;
+ *h = initial_h;
+ return;
+ }
+ if (frame < 120) {
+ *w = initial_w * 3 / 4;
+ *h = initial_h * 3 / 4;
+ return;
+ }
+ if (frame < 130) {
+ *w = initial_w / 2;
+ *h = initial_h / 2;
+ return;
+ }
+ if (frame < 140) {
+ *w = initial_w * 3 / 4;
+ *h = initial_h * 3 / 4;
+ return;
+ }
+ if (frame < 150) {
+ *w = initial_w;
+ *h = initial_h;
+ return;
+ }
+ if (frame < 160) {
+ *w = initial_w * 3 / 4;
+ *h = initial_h * 3 / 4;
+ return;
+ }
+ if (frame < 170) {
+ *w = initial_w / 2;
+ *h = initial_h / 2;
+ return;
+ }
+ if (frame < 180) {
+ *w = initial_w * 3 / 4;
+ *h = initial_h * 3 / 4;
+ return;
+ }
+ if (frame < 190) {
+ *w = initial_w;
+ *h = initial_h;
+ return;
+ }
+ if (frame < 200) {
+ *w = initial_w * 3 / 4;
+ *h = initial_h * 3 / 4;
+ return;
+ }
+ if (frame < 210) {
+ *w = initial_w / 2;
+ *h = initial_h / 2;
+ return;
+ }
+ if (frame < 220) {
+ *w = initial_w * 3 / 4;
+ *h = initial_h * 3 / 4;
+ return;
+ }
+ if (frame < 230) {
+ *w = initial_w;
+ *h = initial_h;
+ return;
+ }
+ if (frame < 240) {
+ *w = initial_w * 3 / 4;
+ *h = initial_h * 3 / 4;
+ return;
+ }
+ if (frame < 250) {
+ *w = initial_w / 2;
+ *h = initial_h / 2;
+ return;
+ }
+ if (frame < 260) {
+ *w = initial_w;
+ *h = initial_h;
+ return;
+ }
+ // Go down very low.
+ if (frame < 270) {
+ *w = initial_w / 4;
+ *h = initial_h / 4;
+ return;
+ }
+ if (flag_codec == 1) {
+    // Cases that only work for AV1.
+ // For AV1: Swap width and height of original.
+ if (frame < 320) {
+ *w = initial_h;
+ *h = initial_w;
+ return;
+ }
+ }
+ *w = initial_w;
+ *h = initial_h;
}
class ResizingVideoSource : public ::libaom_test::DummyVideoSource {
public:
ResizingVideoSource() {
SetSize(kInitialWidth, kInitialHeight);
- limit_ = 60;
+ limit_ = 350;
}
-
+ int flag_codec_;
virtual ~ResizingVideoSource() {}
protected:
virtual void Next() {
++frame_;
- SetSize(ScaleForFrameNumber(frame_, kInitialWidth),
- ScaleForFrameNumber(frame_, kInitialHeight));
+ unsigned int width;
+ unsigned int height;
+ ScaleForFrameNumber(frame_, kInitialWidth, kInitialHeight, &width, &height,
+ flag_codec_);
+ SetSize(width, height);
FillFrame();
}
};
@@ -141,15 +287,17 @@
TEST_P(ResizeTest, TestExternalResizeWorks) {
ResizingVideoSource video;
+ video.flag_codec_ = 0;
cfg_.g_lag_in_frames = 0;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
info != frame_info_list_.end(); ++info) {
const unsigned int frame = static_cast<unsigned>(info->pts);
- const unsigned int expected_w = ScaleForFrameNumber(frame, kInitialWidth);
- const unsigned int expected_h = ScaleForFrameNumber(frame, kInitialHeight);
-
+ unsigned int expected_w;
+ unsigned int expected_h;
+ ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w,
+ &expected_h, 0);
EXPECT_EQ(expected_w, info->w) << "Frame " << frame
<< " had unexpected width";
EXPECT_EQ(expected_h, info->h) << "Frame " << frame
@@ -215,7 +363,7 @@
}
virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
- if (!frame0_psnr_) frame0_psnr_ = pkt->data.psnr.psnr[0];
+ if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
}
@@ -310,6 +458,14 @@
frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h));
}
+ virtual void MismatchHook(const aom_image_t *img1, const aom_image_t *img2) {
+ double mismatch_psnr = compute_psnr(img1, img2);
+ mismatch_psnr_ += mismatch_psnr;
+ ++mismatch_nframes_;
+ }
+
+ unsigned int GetMismatchFrames() { return mismatch_nframes_; }
+
void DefaultConfig() {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 600;
@@ -335,24 +491,33 @@
std::vector<FrameInfo> frame_info_list_;
int set_cpu_used_;
bool change_bitrate_;
+ double mismatch_psnr_;
+ int mismatch_nframes_;
};
TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
ResizingVideoSource video;
+ video.flag_codec_ = 1;
DefaultConfig();
+ // Disable internal resize for this test.
+ cfg_.rc_resize_allowed = 0;
change_bitrate_ = false;
+ mismatch_psnr_ = 0.0;
+ mismatch_nframes_ = 0;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
for (std::vector<FrameInfo>::const_iterator info = frame_info_list_.begin();
info != frame_info_list_.end(); ++info) {
const unsigned int frame = static_cast<unsigned>(info->pts);
- const unsigned int expected_w = ScaleForFrameNumber(frame, kInitialWidth);
- const unsigned int expected_h = ScaleForFrameNumber(frame, kInitialHeight);
-
+ unsigned int expected_w;
+ unsigned int expected_h;
+ ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w,
+ &expected_h, 1);
EXPECT_EQ(expected_w, info->w) << "Frame " << frame
<< " had unexpected width";
EXPECT_EQ(expected_h, info->h) << "Frame " << frame
<< " had unexpected height";
+ EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
}
}
@@ -366,6 +531,8 @@
cfg_.g_w = 352;
cfg_.g_h = 288;
change_bitrate_ = false;
+ mismatch_psnr_ = 0.0;
+ mismatch_nframes_ = 0;
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
unsigned int last_w = cfg_.g_w;
@@ -383,8 +550,13 @@
}
}
+#if CONFIG_AV1_DECODER
// Verify that we get 1 resize down event in this test.
ASSERT_EQ(1, resize_count) << "Resizing should occur.";
+ EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+#else
+ printf("Warning: AV1 decoder unavailable, unable to check resize count!\n");
+#endif
}
// Verify the dynamic resizer behavior for real time, 1 pass CBR mode.
@@ -397,6 +569,8 @@
cfg_.g_w = 352;
cfg_.g_h = 288;
change_bitrate_ = true;
+ mismatch_psnr_ = 0.0;
+ mismatch_nframes_ = 0;
// Disable dropped frames.
cfg_.rc_dropframe_thresh = 0;
// Starting bitrate low.
@@ -424,8 +598,13 @@
}
}
+#if CONFIG_AV1_DECODER
// Verify that we get 2 resize events in this test.
ASSERT_EQ(resize_count, 2) << "Resizing should occur twice.";
+ EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames());
+#else
+ printf("Warning: AV1 decoder unavailable, unable to check resize count!\n");
+#endif
}
aom_img_fmt_t CspForFrameNumber(int frame) {
@@ -477,7 +656,7 @@
}
virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
- if (!frame0_psnr_) frame0_psnr_ = pkt->data.psnr.psnr[0];
+ if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0);
}
diff --git a/test/sad_test.cc b/test/sad_test.cc
index ddc2422..b777658 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -53,13 +53,13 @@
reference_data8_ = reinterpret_cast<uint8_t *>(
aom_memalign(kDataAlignment, kDataBufferSize));
second_pred8_ =
- reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 64 * 64));
+ reinterpret_cast<uint8_t *>(aom_memalign(kDataAlignment, 128 * 128));
source_data16_ = reinterpret_cast<uint16_t *>(
aom_memalign(kDataAlignment, kDataBlockSize * sizeof(uint16_t)));
reference_data16_ = reinterpret_cast<uint16_t *>(
aom_memalign(kDataAlignment, kDataBufferSize * sizeof(uint16_t)));
second_pred16_ = reinterpret_cast<uint16_t *>(
- aom_memalign(kDataAlignment, 64 * 64 * sizeof(uint16_t)));
+ aom_memalign(kDataAlignment, 128 * 128 * sizeof(uint16_t)));
}
static void TearDownTestCase() {
@@ -80,9 +80,9 @@
virtual void TearDown() { libaom_test::ClearSystemState(); }
protected:
- // Handle blocks up to 4 blocks 64x64 with stride up to 128
+ // Handle up to 4 128x128 blocks, with stride up to 256
static const int kDataAlignment = 16;
- static const int kDataBlockSize = 64 * 128;
+ static const int kDataBlockSize = 128 * 256;
static const int kDataBufferSize = 4 * kDataBlockSize;
virtual void SetUp() {
@@ -473,6 +473,11 @@
//------------------------------------------------------------------------------
// C functions
const SadMxNParam c_tests[] = {
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(128, 128, &aom_sad128x128_c, -1),
+ make_tuple(128, 64, &aom_sad128x64_c, -1),
+ make_tuple(64, 128, &aom_sad64x128_c, -1),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &aom_sad64x64_c, -1),
make_tuple(64, 32, &aom_sad64x32_c, -1),
make_tuple(32, 64, &aom_sad32x64_c, -1),
@@ -487,6 +492,11 @@
make_tuple(4, 8, &aom_sad4x8_c, -1),
make_tuple(4, 4, &aom_sad4x4_c, -1),
#if CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(128, 128, &aom_highbd_sad128x128_c, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64_c, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128_c, 8),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &aom_highbd_sad64x64_c, 8),
make_tuple(64, 32, &aom_highbd_sad64x32_c, 8),
make_tuple(32, 64, &aom_highbd_sad32x64_c, 8),
@@ -500,6 +510,11 @@
make_tuple(8, 4, &aom_highbd_sad8x4_c, 8),
make_tuple(4, 8, &aom_highbd_sad4x8_c, 8),
make_tuple(4, 4, &aom_highbd_sad4x4_c, 8),
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(128, 128, &aom_highbd_sad128x128_c, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64_c, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128_c, 10),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &aom_highbd_sad64x64_c, 10),
make_tuple(64, 32, &aom_highbd_sad64x32_c, 10),
make_tuple(32, 64, &aom_highbd_sad32x64_c, 10),
@@ -513,6 +528,11 @@
make_tuple(8, 4, &aom_highbd_sad8x4_c, 10),
make_tuple(4, 8, &aom_highbd_sad4x8_c, 10),
make_tuple(4, 4, &aom_highbd_sad4x4_c, 10),
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(128, 128, &aom_highbd_sad128x128_c, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64_c, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128_c, 12),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &aom_highbd_sad64x64_c, 12),
make_tuple(64, 32, &aom_highbd_sad64x32_c, 12),
make_tuple(32, 64, &aom_highbd_sad32x64_c, 12),
@@ -531,6 +551,11 @@
INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
const SadMxNAvgParam avg_c_tests[] = {
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(128, 128, &aom_sad128x128_avg_c, -1),
+ make_tuple(128, 64, &aom_sad128x64_avg_c, -1),
+ make_tuple(64, 128, &aom_sad64x128_avg_c, -1),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &aom_sad64x64_avg_c, -1),
make_tuple(64, 32, &aom_sad64x32_avg_c, -1),
make_tuple(32, 64, &aom_sad32x64_avg_c, -1),
@@ -545,6 +570,11 @@
make_tuple(4, 8, &aom_sad4x8_avg_c, -1),
make_tuple(4, 4, &aom_sad4x4_avg_c, -1),
#if CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 8),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 8),
make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 8),
make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 8),
@@ -558,6 +588,11 @@
make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 8),
make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 8),
make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 8),
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 10),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 10),
make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 10),
make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 10),
@@ -571,6 +606,11 @@
make_tuple(8, 4, &aom_highbd_sad8x4_avg_c, 10),
make_tuple(4, 8, &aom_highbd_sad4x8_avg_c, 10),
make_tuple(4, 4, &aom_highbd_sad4x4_avg_c, 10),
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(128, 128, &aom_highbd_sad128x128_avg_c, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64_avg_c, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128_avg_c, 12),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &aom_highbd_sad64x64_avg_c, 12),
make_tuple(64, 32, &aom_highbd_sad64x32_avg_c, 12),
make_tuple(32, 64, &aom_highbd_sad32x64_avg_c, 12),
@@ -589,6 +629,11 @@
INSTANTIATE_TEST_CASE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
const SadMxNx4Param x4d_c_tests[] = {
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(128, 128, &aom_sad128x128x4d_c, -1),
+ make_tuple(128, 64, &aom_sad128x64x4d_c, -1),
+ make_tuple(64, 128, &aom_sad64x128x4d_c, -1),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &aom_sad64x64x4d_c, -1),
make_tuple(64, 32, &aom_sad64x32x4d_c, -1),
make_tuple(32, 64, &aom_sad32x64x4d_c, -1),
@@ -603,6 +648,11 @@
make_tuple(4, 8, &aom_sad4x8x4d_c, -1),
make_tuple(4, 4, &aom_sad4x4x4d_c, -1),
#if CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 8),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 8),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 8),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 8),
make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 8),
make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 8),
@@ -616,6 +666,11 @@
make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 8),
make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 8),
make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 8),
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 10),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 10),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 10),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 10),
make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 10),
make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 10),
@@ -629,6 +684,11 @@
make_tuple(8, 4, &aom_highbd_sad8x4x4d_c, 10),
make_tuple(4, 8, &aom_highbd_sad4x8x4d_c, 10),
make_tuple(4, 4, &aom_highbd_sad4x4x4d_c, 10),
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(128, 128, &aom_highbd_sad128x128x4d_c, 12),
+ make_tuple(128, 64, &aom_highbd_sad128x64x4d_c, 12),
+ make_tuple(64, 128, &aom_highbd_sad64x128x4d_c, 12),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &aom_highbd_sad64x64x4d_c, 12),
make_tuple(64, 32, &aom_highbd_sad64x32x4d_c, 12),
make_tuple(32, 64, &aom_highbd_sad32x64x4d_c, 12),
@@ -677,9 +737,13 @@
//------------------------------------------------------------------------------
// x86 functions
-
#if HAVE_SSE2
const SadMxNParam sse2_tests[] = {
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(128, 128, &aom_sad128x128_sse2, -1),
+ make_tuple(128, 64, &aom_sad128x64_sse2, -1),
+ make_tuple(64, 128, &aom_sad64x128_sse2, -1),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &aom_sad64x64_sse2, -1),
make_tuple(64, 32, &aom_sad64x32_sse2, -1),
make_tuple(32, 64, &aom_sad32x64_sse2, -1),
@@ -732,6 +796,11 @@
INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
const SadMxNAvgParam avg_sse2_tests[] = {
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(128, 128, &aom_sad128x128_avg_sse2, -1),
+ make_tuple(128, 64, &aom_sad128x64_avg_sse2, -1),
+ make_tuple(64, 128, &aom_sad64x128_avg_sse2, -1),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &aom_sad64x64_avg_sse2, -1),
make_tuple(64, 32, &aom_sad64x32_avg_sse2, -1),
make_tuple(32, 64, &aom_sad32x64_avg_sse2, -1),
@@ -784,6 +853,11 @@
INSTANTIATE_TEST_CASE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));
const SadMxNx4Param x4d_sse2_tests[] = {
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(128, 128, &aom_sad128x128x4d_sse2, -1),
+ make_tuple(128, 64, &aom_sad128x64x4d_sse2, -1),
+ make_tuple(64, 128, &aom_sad64x128x4d_sse2, -1),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
make_tuple(64, 64, &aom_sad64x64x4d_sse2, -1),
make_tuple(64, 32, &aom_sad64x32x4d_sse2, -1),
make_tuple(32, 64, &aom_sad32x64x4d_sse2, -1),
diff --git a/test/scan_test.cc b/test/scan_test.cc
index 85fffb9..22a6d85 100644
--- a/test/scan_test.cc
+++ b/test/scan_test.cc
@@ -9,75 +9,72 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-
#include "av1/common/scan.h"
-#include "test/acm_random.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
-using libaom_test::ACMRandom;
-
namespace {
-TEST(scan_test, av1_augment_prob) {
- int tx1d_size = 4;
+TEST(ScanTest, av1_augment_prob) {
+ const int tx1d_size = 4;
uint32_t prob[16] = { 8, 8, 7, 7, 8, 8, 4, 2, 3, 3, 2, 2, 2, 2, 2, 2 };
- uint32_t ref_prob[16] = { 8, 8, 7, 7, 8, 8, 4, 2, 3, 3, 2, 2, 2, 2, 2, 2 };
+ const uint32_t ref_prob[16] = {
+ 8, 8, 7, 7, 8, 8, 4, 2, 3, 3, 2, 2, 2, 2, 2, 2
+ };
av1_augment_prob(prob, tx1d_size, tx1d_size);
for (int r = 0; r < tx1d_size; ++r) {
for (int c = 0; c < tx1d_size; ++c) {
- int idx = r * tx1d_size + c;
+ const int idx = r * tx1d_size + c;
EXPECT_EQ(ref_prob[idx], prob[idx] >> 16);
}
}
- int mask = (1 << 10) - 1;
+ const int mask = (1 << 10) - 1;
for (int r = 0; r < tx1d_size; ++r) {
for (int c = 0; c < tx1d_size; ++c) {
- int idx = r * tx1d_size + c;
+ const int idx = r * tx1d_size + c;
EXPECT_EQ(idx, mask ^ (prob[r * tx1d_size + c] & mask));
}
}
}
-TEST(scan_test, av1_update_sort_order) {
- int tx_size = TX_4X4;
- uint32_t prob[16] = { 8, 8, 7, 7, 8, 8, 4, 2, 3, 3, 2, 2, 2, 2, 2, 2 };
- int16_t ref_sort_order[16] = { 0, 1, 4, 5, 2, 3, 6, 8,
- 9, 12, 7, 10, 13, 11, 14, 15 };
+TEST(ScanTest, av1_update_sort_order) {
+ const TX_SIZE tx_size = TX_4X4;
+ const uint32_t prob[16] = { 8, 8, 7, 7, 8, 8, 4, 2, 3, 3, 2, 2, 2, 2, 2, 2 };
+ const int16_t ref_sort_order[16] = { 0, 1, 4, 5, 2, 3, 6, 8,
+ 9, 12, 7, 10, 13, 11, 14, 15 };
int16_t sort_order[16];
av1_update_sort_order(tx_size, prob, sort_order);
for (int i = 0; i < 16; ++i) EXPECT_EQ(ref_sort_order[i], sort_order[i]);
}
-TEST(scan_test, av1_update_scan_order) {
- int tx_size = TX_4X4;
- uint32_t prob[16] = { 4, 5, 7, 4, 5, 6, 8, 2, 3, 3, 2, 2, 2, 2, 2, 2 };
+TEST(ScanTest, av1_update_scan_order) {
+ TX_SIZE tx_size = TX_4X4;
+ const uint32_t prob[16] = { 4, 5, 7, 4, 5, 6, 8, 2, 3, 3, 2, 2, 2, 2, 2, 2 };
int16_t sort_order[16];
int16_t scan[16];
int16_t iscan[16];
- int16_t ref_iscan[16] = {
- 0, 1, 2, 6, 3, 4, 5, 10, 7, 8, 11, 13, 9, 12, 14, 15
- };
+ const int16_t ref_iscan[16] = { 0, 1, 2, 6, 3, 4, 5, 10,
+ 7, 8, 11, 13, 9, 12, 14, 15 };
av1_update_sort_order(tx_size, prob, sort_order);
av1_update_scan_order(tx_size, sort_order, scan, iscan);
- for (int i = 0; i < 16; ++i) EXPECT_EQ(ref_iscan[i], iscan[i]);
-
- for (int i = 0; i < 16; ++i) EXPECT_EQ(i, scan[ref_iscan[i]]);
+ for (int i = 0; i < 16; ++i) {
+ EXPECT_EQ(ref_iscan[i], iscan[i]);
+ EXPECT_EQ(i, scan[ref_iscan[i]]);
+ }
}
-TEST(scan_test, av1_update_neighbors) {
- int tx_size = TX_4X4;
+TEST(ScanTest, av1_update_neighbors) {
+ TX_SIZE tx_size = TX_4X4;
// raster order
- int16_t scan[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+ const int16_t scan[16] = { 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15 };
int16_t nb[(16 + 1) * 2];
- int16_t ref_nb[(16 + 1) * 2] = { 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 4, 1,
- 5, 2, 6, 3, 4, 4, 8, 5, 9, 6, 10, 7,
- 8, 8, 12, 9, 13, 10, 14, 11, 0, 0 };
+ const int16_t ref_nb[(16 + 1) * 2] = { 0, 0, 0, 0, 1, 1, 2, 2, 0,
+ 0, 4, 1, 5, 2, 6, 3, 4, 4,
+ 8, 5, 9, 6, 10, 7, 8, 8, 12,
+ 9, 13, 10, 14, 11, 0, 0 };
// raster order's scan and iscan are the same
av1_update_neighbors(tx_size, scan, scan, nb);
diff --git a/test/simple_encoder.sh b/test/simple_encoder.sh
index c577327..5cd6b46 100755
--- a/test/simple_encoder.sh
+++ b/test/simple_encoder.sh
@@ -23,7 +23,7 @@
fi
}
-# Runs simple_encoder using the codec specified by $1.
+# Runs simple_encoder using the codec specified by $1 with a frame limit of 100.
simple_encoder() {
local encoder="${LIBAOM_BIN_PATH}/simple_encoder${AOM_TEST_EXE_SUFFIX}"
local codec="$1"
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
index 8905acf..6bd1b2b 100644
--- a/test/subtract_test.cc
+++ b/test/subtract_test.cc
@@ -11,21 +11,26 @@
#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./av1_rtcd.h"
#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
+#include "test/util.h"
+#if CONFIG_AV1
#include "av1/common/blockd.h"
+#endif
#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#define USE_SPEED_TEST (0)
typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr,
ptrdiff_t diff_stride, const uint8_t *src_ptr,
ptrdiff_t src_stride, const uint8_t *pred_ptr,
ptrdiff_t pred_stride);
-namespace av1 {
+namespace {
class AV1SubtractBlockTest : public ::testing::TestWithParam<SubtractFunc> {
public:
@@ -102,4 +107,146 @@
::testing::Values(aom_subtract_block_msa));
#endif
-} // namespace av1
+typedef void (*HBDSubtractFunc)(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride, int bd);
+
+using ::std::tr1::get;
+using ::std::tr1::make_tuple;
+using ::std::tr1::tuple;
+
+// <width, height, bit_depth, subtract>
+typedef tuple<int, int, int, HBDSubtractFunc> Params;
+
+#if CONFIG_AOM_HIGHBITDEPTH
+class AV1HBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
+ public:
+ virtual void SetUp() {
+ block_width_ = GET_PARAM(0);
+ block_height_ = GET_PARAM(1);
+ bit_depth_ = static_cast<aom_bit_depth_t>(GET_PARAM(2));
+ func_ = GET_PARAM(3);
+
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+
+ const size_t max_width = 128;
+ const size_t max_block_size = max_width * max_width;
+ src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+ aom_memalign(16, max_block_size * sizeof(uint16_t))));
+ pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+ aom_memalign(16, max_block_size * sizeof(uint16_t))));
+ diff_ = reinterpret_cast<int16_t *>(
+ aom_memalign(16, max_block_size * sizeof(int16_t)));
+ }
+
+ virtual void TearDown() {
+ aom_free(CONVERT_TO_SHORTPTR(src_));
+ aom_free(CONVERT_TO_SHORTPTR(pred_));
+ aom_free(diff_);
+ }
+
+ protected:
+ void RunForSpeed();
+ void CheckResult();
+
+ private:
+ ACMRandom rnd_;
+ int block_height_;
+ int block_width_;
+ aom_bit_depth_t bit_depth_;
+ HBDSubtractFunc func_;
+ uint8_t *src_;
+ uint8_t *pred_;
+ int16_t *diff_;
+};
+
+void AV1HBDSubtractBlockTest::RunForSpeed() {
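+  // Fill src/pred once with random samples limited to bit_depth_, then call
+  // the subtract function repeatedly so it can be timed externally.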
+ const int test_num = 200000;
+ const int max_width = 128;
+ const int max_block_size = max_width * max_width;
+ const int mask = (1 << bit_depth_) - 1;
+ int i, j;
+
+ for (j = 0; j < max_block_size; ++j) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+ CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+ }
+
+ for (i = 0; i < test_num; ++i) {
+ func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
+ pred_, block_width_, bit_depth_);
+ }
+}
+
+void AV1HBDSubtractBlockTest::CheckResult() {
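+  // For each trial, fill src/pred with fresh random bit_depth_-limited samples
+  // and verify that every diff entry equals src minus pred.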
+ const int test_num = 100;
+ const int max_width = 128;
+ const int max_block_size = max_width * max_width;
+ const int mask = (1 << bit_depth_) - 1;
+ int i, j;
+
+ for (i = 0; i < test_num; ++i) {
+ for (j = 0; j < max_block_size; ++j) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+ CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+ }
+
+ func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
+ pred_, block_width_, bit_depth_);
+
+ for (int r = 0; r < block_height_; ++r) {
+ for (int c = 0; c < block_width_; ++c) {
+ EXPECT_EQ(diff_[r * block_width_ + c],
+ (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] -
+ CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c]))
+ << "r = " << r << ", c = " << c << ", test: " << i;
+ }
+ }
+ }
+}
+
+TEST_P(AV1HBDSubtractBlockTest, CheckResult) { CheckResult(); }
+
+#if USE_SPEED_TEST
+TEST_P(AV1HBDSubtractBlockTest, CheckSpeed) { RunForSpeed(); }
+#endif // USE_SPEED_TEST
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, AV1HBDSubtractBlockTest,
+ ::testing::Values(make_tuple(4, 4, 12, aom_highbd_subtract_block_sse2),
+ make_tuple(4, 4, 12, aom_highbd_subtract_block_c),
+ make_tuple(4, 8, 12, aom_highbd_subtract_block_sse2),
+ make_tuple(4, 8, 12, aom_highbd_subtract_block_c),
+ make_tuple(8, 4, 12, aom_highbd_subtract_block_sse2),
+ make_tuple(8, 4, 12, aom_highbd_subtract_block_c),
+ make_tuple(8, 8, 12, aom_highbd_subtract_block_sse2),
+ make_tuple(8, 8, 12, aom_highbd_subtract_block_c),
+ make_tuple(8, 16, 12, aom_highbd_subtract_block_sse2),
+ make_tuple(8, 16, 12, aom_highbd_subtract_block_c),
+ make_tuple(16, 8, 12, aom_highbd_subtract_block_sse2),
+ make_tuple(16, 8, 12, aom_highbd_subtract_block_c),
+ make_tuple(16, 16, 12, aom_highbd_subtract_block_sse2),
+ make_tuple(16, 16, 12, aom_highbd_subtract_block_c),
+ make_tuple(16, 32, 12, aom_highbd_subtract_block_sse2),
+ make_tuple(16, 32, 12, aom_highbd_subtract_block_c),
+ make_tuple(32, 16, 12, aom_highbd_subtract_block_sse2),
+ make_tuple(32, 16, 12, aom_highbd_subtract_block_c),
+ make_tuple(32, 32, 12, aom_highbd_subtract_block_sse2),
+ make_tuple(32, 32, 12, aom_highbd_subtract_block_c),
+ make_tuple(32, 64, 12, aom_highbd_subtract_block_sse2),
+ make_tuple(32, 64, 12, aom_highbd_subtract_block_c),
+ make_tuple(64, 32, 12, aom_highbd_subtract_block_sse2),
+ make_tuple(64, 32, 12, aom_highbd_subtract_block_c),
+ make_tuple(64, 64, 12, aom_highbd_subtract_block_sse2),
+ make_tuple(64, 64, 12, aom_highbd_subtract_block_c),
+ make_tuple(64, 128, 12, aom_highbd_subtract_block_sse2),
+ make_tuple(64, 128, 12, aom_highbd_subtract_block_c),
+ make_tuple(128, 64, 12, aom_highbd_subtract_block_sse2),
+ make_tuple(128, 64, 12, aom_highbd_subtract_block_c),
+ make_tuple(128, 128, 12, aom_highbd_subtract_block_sse2),
+ make_tuple(128, 128, 12, aom_highbd_subtract_block_c)));
+#endif // HAVE_SSE2
+#endif // CONFIG_AOM_HIGHBITDEPTH
+} // namespace
diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc
new file mode 100644
index 0000000..d651072
--- /dev/null
+++ b/test/sum_squares_test.cc
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "test/function_equivalence_test.h"
+
+using libaom_test::ACMRandom;
+using libaom_test::FunctionEquivalenceTest;
+
+namespace {
+const int kNumIterations = 10000;
+
+static const int16_t kInt13Max = (1 << 12) - 1;
+
+typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int size);
+typedef libaom_test::FuncParam<SSI16Func> TestFuncs;
+
+class SumSquaresTest : public ::testing::TestWithParam<TestFuncs> {
+ public:
+ virtual ~SumSquaresTest() {}
+ virtual void SetUp() { params_ = this->GetParam(); }
+
+ virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+ protected:
+ TestFuncs params_;
+};
+
+TEST_P(SumSquaresTest, OperationCheck) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, int16_t, src[256 * 256]);
+
+ int failed = 0;
+
+ const int msb = 11; // Up to 12 bit input
+ const int limit = 1 << (msb + 1);
+
+ for (int k = 0; k < kNumIterations; k++) {
+ int size = 4 << rnd(6); // Up to 128x128
+ int stride = 4 << rnd(7); // Up to 256 stride
+ while (stride < size) { // Make sure it's valid
+ stride = 4 << rnd(7);
+ }
+
+ for (int ii = 0; ii < size; ii++) {
+ for (int jj = 0; jj < size; jj++) {
+ src[ii * stride + jj] = rnd(2) ? rnd(limit) : -rnd(limit);
+ }
+ }
+
+ const uint64_t res_ref = params_.ref_func(src, stride, size);
+ uint64_t res_tst;
+ ASM_REGISTER_STATE_CHECK(res_tst = params_.tst_func(src, stride, size));
+
+ if (!failed) {
+ failed = res_ref != res_tst;
+ EXPECT_EQ(res_ref, res_tst)
+ << "Error: Sum Squares Test"
+ << " C output does not match optimized output.";
+ }
+ }
+}
+
+TEST_P(SumSquaresTest, ExtremeValues) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, int16_t, src[256 * 256]);
+
+ int failed = 0;
+
+ const int msb = 11; // Up to 12 bit input
+ const int limit = 1 << (msb + 1);
+
+ for (int k = 0; k < kNumIterations; k++) {
+ int size = 4 << rnd(6); // Up to 128x128
+ int stride = 4 << rnd(7); // Up to 256 stride
+ while (stride < size) { // Make sure it's valid
+ stride = 4 << rnd(7);
+ }
+
+ int val = rnd(2) ? limit - 1 : -(limit - 1);
+ for (int ii = 0; ii < size; ii++) {
+ for (int jj = 0; jj < size; jj++) {
+ src[ii * stride + jj] = val;
+ }
+ }
+
+ const uint64_t res_ref = params_.ref_func(src, stride, size);
+ uint64_t res_tst;
+ ASM_REGISTER_STATE_CHECK(res_tst = params_.tst_func(src, stride, size));
+
+ if (!failed) {
+ failed = res_ref != res_tst;
+ EXPECT_EQ(res_ref, res_tst)
+ << "Error: Sum Squares Test"
+ << " C output does not match optimized output.";
+ }
+ }
+}
+
+#if HAVE_SSE2
+
+INSTANTIATE_TEST_CASE_P(
+ SSE2, SumSquaresTest,
+ ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c,
+ &aom_sum_squares_2d_i16_sse2)));
+
+#endif // HAVE_SSE2
+
+//////////////////////////////////////////////////////////////////////////////
+// 1D version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef uint64_t (*F1D)(const int16_t *src, uint32_t N);
+typedef libaom_test::FuncParam<F1D> TestFuncs1D;
+
+class SumSquares1DTest : public FunctionEquivalenceTest<F1D> {
+ protected:
+ static const int kIterations = 1000;
+ static const int kMaxSize = 256;
+};
+
+TEST_P(SumSquares1DTest, RandomValues) {
+ DECLARE_ALIGNED(16, int16_t, src[kMaxSize * kMaxSize]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0; i < kMaxSize * kMaxSize; ++i)
+ src[i] = rng_(kInt13Max * 2 + 1) - kInt13Max;
+
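+    // Pick either a long length in [kMaxSize, kMaxSize * kMaxSize] or a short
+    // one in [1, kMaxSize].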
+ const int N = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize
+ : rng_(kMaxSize) + 1;
+
+ const uint64_t ref_res = params_.ref_func(src, N);
+ uint64_t tst_res;
+ ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+TEST_P(SumSquares1DTest, ExtremeValues) {
+ DECLARE_ALIGNED(16, int16_t, src[kMaxSize * kMaxSize]);
+
+ for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+ if (rng_(2)) {
+ for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = kInt13Max;
+ } else {
+ for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = -kInt13Max;
+ }
+
+ const int N = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize
+ : rng_(kMaxSize) + 1;
+
+ const uint64_t ref_res = params_.ref_func(src, N);
+ uint64_t tst_res;
+ ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
+
+ ASSERT_EQ(ref_res, tst_res);
+ }
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, SumSquares1DTest,
+ ::testing::Values(TestFuncs1D(
+ aom_sum_squares_i16_c, aom_sum_squares_i16_sse2)));
+
+#endif // HAVE_SSE2
+} // namespace
diff --git a/test/superframe_test.cc b/test/superframe_test.cc
index bfa47c9..94f4be9 100644
--- a/test/superframe_test.cc
+++ b/test/superframe_test.cc
@@ -20,8 +20,11 @@
const int kTestMode = 0;
const int kSuperframeSyntax = 1;
+const int kTileCols = 2;
+const int kTileRows = 3;
-typedef std::tr1::tuple<libaom_test::TestMode, int> SuperframeTestParam;
+typedef std::tr1::tuple<libaom_test::TestMode, int, int, int>
+ SuperframeTestParam;
class SuperframeTest
: public ::libaom_test::EncoderTest,
@@ -40,6 +43,8 @@
sf_count_ = 0;
sf_count_max_ = INT_MAX;
is_av1_style_superframe_ = syntax;
+ n_tile_cols_ = std::tr1::get<kTileCols>(input);
+ n_tile_rows_ = std::tr1::get<kTileRows>(input);
}
virtual void TearDown() { delete[] modified_buf_; }
@@ -48,6 +53,9 @@
libaom_test::Encoder *encoder) {
if (video->frame() == 1) {
encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_CPUUSED, 2);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
+ encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
}
}
@@ -87,6 +95,10 @@
aom_codec_cx_pkt_t modified_pkt_;
uint8_t *modified_buf_;
aom_codec_pts_t last_sf_pts_;
+
+ private:
+ int n_tile_cols_;
+ int n_tile_rows_;
};
TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) {
@@ -105,34 +117,29 @@
#endif // CONFIG_EXT_REFS
}
+// The superframe index is currently mandatory with ANS due to the decoder
+// starting at the end of the buffer.
+#if CONFIG_EXT_TILE
+// Single tile does not work with ANS (see comment above).
#if CONFIG_ANS
-// TODO(aconverse@google.com): Because the ANS decoder starts reading from the
-// end of the buffer, it can't find the end of the first frame of the
-// superframe. This can be ameliorated by reversing the order of the frames in
-// the superframe or reversing the bytes of each ANS buffer.
-INSTANTIATE_TEST_CASE_P(
- DISABLED_AV1, SuperframeTest,
- ::testing::Combine(
- ::testing::Values(
- static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)),
- ::testing::Combine(::testing::Values(::libaom_test::kTwoPassGood),
- ::testing::Values(1))));
-#elif CONFIG_DAALA_EC
-// TODO(negge@mozilla.com): Because the Daala EC decoder reads raw bits in
-// reverse order from the rear of the entropy coder buffer, it cannot decode
-// a superframe without knowing the length of the frame. This should be
-// handled by some higher level syntax that does not exist yet.
-INSTANTIATE_TEST_CASE_P(
- DISABLED_AV1, SuperframeTest,
- ::testing::Combine(
- ::testing::Values(
- static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)),
- ::testing::Combine(::testing::Values(::libaom_test::kTwoPassGood),
- ::testing::Values(1))));
+const int tile_col_values[] = { 1, 2 };
#else
+const int tile_col_values[] = { 1, 2, 32 };
+#endif
+const int tile_row_values[] = { 1, 2, 32 };
AV1_INSTANTIATE_TEST_CASE(
SuperframeTest,
::testing::Combine(::testing::Values(::libaom_test::kTwoPassGood),
- ::testing::Values(1)));
-#endif
+ ::testing::Values(1),
+ ::testing::ValuesIn(tile_col_values),
+ ::testing::ValuesIn(tile_row_values)));
+#else
+#if !CONFIG_ANS && !CONFIG_DAALA_EC
+AV1_INSTANTIATE_TEST_CASE(
+ SuperframeTest,
+ ::testing::Combine(::testing::Values(::libaom_test::kTwoPassGood),
+ ::testing::Values(1), ::testing::Values(0),
+ ::testing::Values(0)));
+#endif // !CONFIG_ANS
+#endif // CONFIG_EXT_TILE
} // namespace
diff --git a/test/test.mk b/test/test.mk
index 5ccb5f2..2d18f69 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -7,6 +7,8 @@
LIBAOM_TEST_SRCS-yes += test_libaom.cc
LIBAOM_TEST_SRCS-yes += util.h
LIBAOM_TEST_SRCS-yes += video_source.h
+LIBAOM_TEST_SRCS-yes += transform_test_base.h
+LIBAOM_TEST_SRCS-yes += function_equivalence_test.h
##
## BLACK BOX TESTS
@@ -16,23 +18,24 @@
LIBAOM_TEST_SRCS-yes += ../md5_utils.h ../md5_utils.c
LIBAOM_TEST_SRCS-$(CONFIG_DECODERS) += ivf_video_source.h
LIBAOM_TEST_SRCS-$(CONFIG_ENCODERS) += ../y4minput.h ../y4minput.c
+LIBAOM_TEST_SRCS-$(CONFIG_ENCODERS) += altref_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_ENCODERS) += aq_segment_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_ENCODERS) += datarate_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_ENCODERS) += encode_api_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_ENCODERS) += error_resilience_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h
-##TODO(jimbankoski): Figure out why resize is failing.
-##LIBAOM_TEST_SRCS-$(CONFIG_ENCODERS) += resize_test.cc
+#LIBAOM_TEST_SRCS-$(CONFIG_ENCODERS) += realtime_test.cc
+#LIBAOM_TEST_SRCS-$(CONFIG_ENCODERS) += resize_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_ENCODERS) += y4m_video_source.h
LIBAOM_TEST_SRCS-$(CONFIG_ENCODERS) += yuv_video_source.h
+#LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += level_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += active_map_refresh_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += active_map_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += borders_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += cpu_speed_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += frame_size_tests.cc
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += lossless_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += end_to_end_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += ethread_test.cc
LIBAOM_TEST_SRCS-yes += decode_test_driver.cc
@@ -73,6 +76,13 @@
LIBAOM_TEST_SRCS-yes += encode_perf_test.cc
endif
+## Multi-codec / unconditional black box tests.
+ifeq ($(findstring yes,$(CONFIG_AV1_ENCODER)),yes)
+LIBAOM_TEST_SRCS-yes += active_map_refresh_test.cc
+LIBAOM_TEST_SRCS-yes += active_map_test.cc
+LIBAOM_TEST_SRCS-yes += end_to_end_test.cc
+endif
+
##
## WHITE BOX TESTS
##
@@ -100,52 +110,91 @@
endif
endif
LIBAOM_TEST_SRCS-yes += divu_small_test.cc
-LIBAOM_TEST_SRCS-yes += encoder_parms_get_to_decoder.cc
+#LIBAOM_TEST_SRCS-yes += encoder_parms_get_to_decoder.cc
endif
LIBAOM_TEST_SRCS-$(CONFIG_ADAPT_SCAN) += scan_test.cc
-LIBAOM_TEST_SRCS-yes += convolve_test.cc
-LIBAOM_TEST_SRCS-yes += convolve_test.cc
-LIBAOM_TEST_SRCS-yes += av1_convolve_test.cc
+#LIBAOM_TEST_SRCS-yes += convolve_test.cc
LIBAOM_TEST_SRCS-yes += lpf_8_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_CLPF) += clpf_test.cc
LIBAOM_TEST_SRCS-yes += intrapred_test.cc
+#LIBAOM_TEST_SRCS-$(CONFIG_AV1_DECODER) += av1_thread_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += dct16x16_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += dct32x32_test.cc
-LIBAOM_TEST_SRCS-yes += convolve_test.cc
-LIBAOM_TEST_SRCS-yes += av1_convolve_optimz_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += fdct4x4_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += fdct8x8_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += hadamard_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += minmax_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += variance_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += quantize_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += error_block_test.cc
+#LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_quantize_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += subtract_test.cc
-LIBAOM_TEST_SRCS-yes += function_equivalence_test.h
-LIBAOM_TEST_SRCS-yes += blend_a64_mask_test.cc
-LIBAOM_TEST_SRCS-yes += blend_a64_mask_1d_test.cc
+
+ifeq ($(CONFIG_AV1_ENCODER)$(CONFIG_AV1_TEMPORAL_DENOISING),yesyes)
+#LIBAOM_TEST_SRCS-$(HAVE_SSE2) += denoiser_sse2_test.cc
+endif
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += arf_freq_test.cc
+
+
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_inv_txfm_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_dct_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht4x4_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht8x8_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht16x16_test.cc
+ifeq ($(CONFIG_EXT_TX),yes)
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht4x8_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht8x4_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht8x16_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht16x8_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht16x32_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_fht32x16_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += fht32x32_test.cc
+endif
+LIBAOM_TEST_SRCS-$(CONFIG_EXT_TILE) += av1_ext_tile_test.cc
+
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += sum_squares_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += subtract_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += blend_a64_mask_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += blend_a64_mask_1d_test.cc
+
+ifeq ($(CONFIG_EXT_INTER),yes)
+LIBAOM_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc
+LIBAOM_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_wedge_utils_test.cc
+endif
+
+ifeq ($(CONFIG_FILTER_INTRA),yes)
+LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += filterintra_predictors_test.cc
+endif
+
ifeq ($(CONFIG_MOTION_VAR),yes)
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += obmc_sad_test.cc
LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += obmc_variance_test.cc
endif
-ifeq ($(CONFIG_AV1_ENCODER)$(CONFIG_AV1_TEMPORAL_DENOISING),yesyes)
-LIBAOM_TEST_SRCS-$(HAVE_SSE2) += denoiser_sse2_test.cc
-endif
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += arf_freq_test.cc
-
-LIBAOM_TEST_SRCS-yes += av1_inv_txfm_test.cc
-LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_dct_test.cc
-
+ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
+LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += av1_quantize_test.cc
+LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += av1_highbd_iht_test.cc
+endif # CONFIG_AOM_HIGHBITDEPTH
endif # AV1
## Multi-codec / unconditional whitebox tests.
-ifeq ($(findstring yes,$(CONFIG_AV1_ENCODER)$(CONFIG_AV1_ENCODER)),yes)
+ifeq ($(CONFIG_AV1_ENCODER),yes)
LIBAOM_TEST_SRCS-yes += avg_test.cc
endif
ifeq ($(CONFIG_INTERNAL_STATS),yes)
LIBAOM_TEST_SRCS-$(CONFIG_AOM_HIGHBITDEPTH) += hbd_metrics_test.cc
endif
LIBAOM_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1) += av1_txfm_test.h
+LIBAOM_TEST_SRCS-$(CONFIG_AV1) += av1_txfm_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1) += av1_fwd_txfm1d_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1) += av1_inv_txfm1d_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1) += av1_fwd_txfm2d_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1) += av1_inv_txfm2d_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1) += av1_convolve_test.cc
+LIBAOM_TEST_SRCS-$(CONFIG_AV1) += av1_convolve_optimz_test.cc
TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 697c28f..f98a730 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -47,12 +47,12 @@
const int kTotalPixels = 32 * kBPS;
DECLARE_ALIGNED(16, uint8_t, src[kTotalPixels]);
DECLARE_ALIGNED(16, uint8_t, ref_src[kTotalPixels]);
- DECLARE_ALIGNED(16, uint8_t, left[kBPS * 2]);
+ DECLARE_ALIGNED(16, uint8_t, left[kBPS]);
DECLARE_ALIGNED(16, uint8_t, above_mem[2 * kBPS + 16]);
uint8_t *const above = above_mem + 16;
for (int i = 0; i < kTotalPixels; ++i) ref_src[i] = rnd.Rand8();
- for (int i = 0; i < kBPS * 2; ++i) left[i] = rnd.Rand8();
- for (int i = -1; i < kBPS * 2; ++i) above[i] = rnd.Rand8();
+ for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand8();
+ for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand8();
const int kNumTests = static_cast<int>(2.e10 / num_pixels_per_test);
// some code assumes the top row has been extended:
@@ -157,120 +157,195 @@
// -----------------------------------------------------------------------------
// 4x4
+#if CONFIG_ALT_INTRA
+#define tm_pred_func aom_paeth_predictor_4x4_c
+#else
+#define tm_pred_func aom_tm_predictor_4x4_c
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(C, TestIntraPred4, aom_dc_predictor_4x4_c,
aom_dc_left_predictor_4x4_c, aom_dc_top_predictor_4x4_c,
aom_dc_128_predictor_4x4_c, aom_v_predictor_4x4_c,
- aom_h_predictor_4x4_c, aom_d45e_predictor_4x4_c,
+ aom_h_predictor_4x4_c, aom_d45_predictor_4x4_c,
aom_d135_predictor_4x4_c, aom_d117_predictor_4x4_c,
- aom_d153_predictor_4x4_c, aom_d207e_predictor_4x4_c,
- aom_d63e_predictor_4x4_c, aom_tm_predictor_4x4_c)
+ aom_d153_predictor_4x4_c, aom_d207_predictor_4x4_c,
+ aom_d63_predictor_4x4_c, tm_pred_func)
+#undef tm_pred_func
#if HAVE_SSE2
+#if CONFIG_ALT_INTRA
+#define tm_pred_func NULL
+#else
+#define tm_pred_func aom_tm_predictor_4x4_sse2
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(SSE2, TestIntraPred4, aom_dc_predictor_4x4_sse2,
aom_dc_left_predictor_4x4_sse2, aom_dc_top_predictor_4x4_sse2,
aom_dc_128_predictor_4x4_sse2, aom_v_predictor_4x4_sse2,
- aom_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
- aom_tm_predictor_4x4_sse2)
+ aom_h_predictor_4x4_sse2, aom_d45_predictor_4x4_sse2, NULL,
+ NULL, NULL, aom_d207_predictor_4x4_sse2, NULL, tm_pred_func)
+#undef tm_pred_func
#endif // HAVE_SSE2
#if HAVE_SSSE3
INTRA_PRED_TEST(SSSE3, TestIntraPred4, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, aom_d153_predictor_4x4_ssse3, NULL, NULL, NULL)
+ NULL, NULL, aom_d153_predictor_4x4_ssse3, NULL,
+ aom_d63_predictor_4x4_ssse3, NULL)
#endif // HAVE_SSSE3
#if HAVE_DSPR2
+#if CONFIG_ALT_INTRA
+#define tm_pred_func NULL
+#else
+#define tm_pred_func aom_tm_predictor_4x4_dspr2
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(DSPR2, TestIntraPred4, aom_dc_predictor_4x4_dspr2, NULL, NULL,
NULL, NULL, aom_h_predictor_4x4_dspr2, NULL, NULL, NULL, NULL,
- NULL, NULL, aom_tm_predictor_4x4_dspr2)
+ NULL, NULL, tm_pred_func)
+#undef tm_pred_func
#endif // HAVE_DSPR2
#if HAVE_NEON
+#if CONFIG_ALT_INTRA
+#define tm_pred_func NULL
+#else
+#define tm_pred_func aom_tm_predictor_4x4_neon
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(NEON, TestIntraPred4, aom_dc_predictor_4x4_neon,
aom_dc_left_predictor_4x4_neon, aom_dc_top_predictor_4x4_neon,
aom_dc_128_predictor_4x4_neon, aom_v_predictor_4x4_neon,
- aom_h_predictor_4x4_neon, NULL, aom_d135_predictor_4x4_neon,
- NULL, NULL, NULL, NULL, aom_tm_predictor_4x4_neon)
+ aom_h_predictor_4x4_neon, aom_d45_predictor_4x4_neon,
+ aom_d135_predictor_4x4_neon, NULL, NULL, NULL, NULL,
+ tm_pred_func)
+#undef tm_pred_func
#endif // HAVE_NEON
#if HAVE_MSA
+#if CONFIG_ALT_INTRA
+#define tm_pred_func NULL
+#else
+#define tm_pred_func aom_tm_predictor_4x4_msa
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(MSA, TestIntraPred4, aom_dc_predictor_4x4_msa,
aom_dc_left_predictor_4x4_msa, aom_dc_top_predictor_4x4_msa,
aom_dc_128_predictor_4x4_msa, aom_v_predictor_4x4_msa,
aom_h_predictor_4x4_msa, NULL, NULL, NULL, NULL, NULL, NULL,
- aom_tm_predictor_4x4_msa)
+ tm_pred_func)
+#undef tm_pred_func
#endif // HAVE_MSA
// -----------------------------------------------------------------------------
// 8x8
+#if CONFIG_ALT_INTRA
+#define tm_pred_func aom_paeth_predictor_8x8_c
+#else
+#define tm_pred_func aom_tm_predictor_8x8_c
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(C, TestIntraPred8, aom_dc_predictor_8x8_c,
aom_dc_left_predictor_8x8_c, aom_dc_top_predictor_8x8_c,
aom_dc_128_predictor_8x8_c, aom_v_predictor_8x8_c,
- aom_h_predictor_8x8_c, aom_d45e_predictor_8x8_c,
+ aom_h_predictor_8x8_c, aom_d45_predictor_8x8_c,
aom_d135_predictor_8x8_c, aom_d117_predictor_8x8_c,
- aom_d153_predictor_8x8_c, aom_d207e_predictor_8x8_c,
- aom_d63e_predictor_8x8_c, aom_tm_predictor_8x8_c)
+ aom_d153_predictor_8x8_c, aom_d207_predictor_8x8_c,
+ aom_d63_predictor_8x8_c, tm_pred_func)
+#undef tm_pred_func
#if HAVE_SSE2
+#if CONFIG_ALT_INTRA
+#define tm_pred_func NULL
+#else
+#define tm_pred_func aom_tm_predictor_8x8_sse2
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(SSE2, TestIntraPred8, aom_dc_predictor_8x8_sse2,
aom_dc_left_predictor_8x8_sse2, aom_dc_top_predictor_8x8_sse2,
aom_dc_128_predictor_8x8_sse2, aom_v_predictor_8x8_sse2,
- aom_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
- aom_tm_predictor_8x8_sse2)
+ aom_h_predictor_8x8_sse2, aom_d45_predictor_8x8_sse2, NULL,
+ NULL, NULL, NULL, NULL, tm_pred_func)
+#undef tm_pred_func
#endif // HAVE_SSE2
#if HAVE_SSSE3
INTRA_PRED_TEST(SSSE3, TestIntraPred8, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, aom_d153_predictor_8x8_ssse3, NULL, NULL, NULL)
+ NULL, NULL, aom_d153_predictor_8x8_ssse3,
+ aom_d207_predictor_8x8_ssse3, aom_d63_predictor_8x8_ssse3, NULL)
#endif // HAVE_SSSE3
#if HAVE_DSPR2
+#if CONFIG_ALT_INTRA
+#define tm_pred_func NULL
+#else
+#define tm_pred_func aom_tm_predictor_8x8_dspr2
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(DSPR2, TestIntraPred8, aom_dc_predictor_8x8_dspr2, NULL, NULL,
NULL, NULL, aom_h_predictor_8x8_dspr2, NULL, NULL, NULL, NULL,
- NULL, NULL, aom_tm_predictor_8x8_c)
+ NULL, NULL, tm_pred_func)
+#undef tm_pred_func
#endif // HAVE_DSPR2
#if HAVE_NEON
+#if CONFIG_ALT_INTRA
+#define tm_pred_func NULL
+#else
+#define tm_pred_func aom_tm_predictor_8x8_neon
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(NEON, TestIntraPred8, aom_dc_predictor_8x8_neon,
aom_dc_left_predictor_8x8_neon, aom_dc_top_predictor_8x8_neon,
aom_dc_128_predictor_8x8_neon, aom_v_predictor_8x8_neon,
- aom_h_predictor_8x8_neon, NULL, NULL, NULL, NULL, NULL, NULL,
- aom_tm_predictor_8x8_neon)
-
+ aom_h_predictor_8x8_neon, aom_d45_predictor_8x8_neon, NULL,
+ NULL, NULL, NULL, NULL, tm_pred_func)
+#undef tm_pred_func
#endif // HAVE_NEON
#if HAVE_MSA
+#if CONFIG_ALT_INTRA
+#define tm_pred_func NULL
+#else
+#define tm_pred_func aom_tm_predictor_8x8_msa
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(MSA, TestIntraPred8, aom_dc_predictor_8x8_msa,
aom_dc_left_predictor_8x8_msa, aom_dc_top_predictor_8x8_msa,
aom_dc_128_predictor_8x8_msa, aom_v_predictor_8x8_msa,
aom_h_predictor_8x8_msa, NULL, NULL, NULL, NULL, NULL, NULL,
- aom_tm_predictor_8x8_msa)
+ tm_pred_func)
+#undef tm_pred_func
#endif // HAVE_MSA
// -----------------------------------------------------------------------------
// 16x16
+#if CONFIG_ALT_INTRA
+#define tm_pred_func aom_paeth_predictor_16x16_c
+#else
+#define tm_pred_func aom_tm_predictor_16x16_c
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(C, TestIntraPred16, aom_dc_predictor_16x16_c,
aom_dc_left_predictor_16x16_c, aom_dc_top_predictor_16x16_c,
aom_dc_128_predictor_16x16_c, aom_v_predictor_16x16_c,
- aom_h_predictor_16x16_c, aom_d45e_predictor_16x16_c,
+ aom_h_predictor_16x16_c, aom_d45_predictor_16x16_c,
aom_d135_predictor_16x16_c, aom_d117_predictor_16x16_c,
- aom_d153_predictor_16x16_c, aom_d207e_predictor_16x16_c,
- aom_d63e_predictor_16x16_c, aom_tm_predictor_16x16_c)
+ aom_d153_predictor_16x16_c, aom_d207_predictor_16x16_c,
+ aom_d63_predictor_16x16_c, tm_pred_func)
+#undef tm_pred_func
#if HAVE_SSE2
+#if CONFIG_ALT_INTRA
+#define tm_pred_func NULL
+#else
+#define tm_pred_func aom_tm_predictor_16x16_sse2
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(SSE2, TestIntraPred16, aom_dc_predictor_16x16_sse2,
aom_dc_left_predictor_16x16_sse2,
aom_dc_top_predictor_16x16_sse2,
aom_dc_128_predictor_16x16_sse2, aom_v_predictor_16x16_sse2,
aom_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
- aom_tm_predictor_16x16_sse2)
+ tm_pred_func)
+#undef tm_pred_func
#endif // HAVE_SSE2
#if HAVE_SSSE3
INTRA_PRED_TEST(SSSE3, TestIntraPred16, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, aom_d153_predictor_16x16_ssse3, NULL, NULL,
- NULL)
+ aom_d45_predictor_16x16_ssse3, NULL, NULL,
+ aom_d153_predictor_16x16_ssse3, aom_d207_predictor_16x16_ssse3,
+ aom_d63_predictor_16x16_ssse3, NULL)
#endif // HAVE_SSSE3
#if HAVE_DSPR2
@@ -280,63 +355,100 @@
#endif // HAVE_DSPR2
#if HAVE_NEON
+#if CONFIG_ALT_INTRA
+#define tm_pred_func NULL
+#else
+#define tm_pred_func aom_tm_predictor_16x16_neon
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(NEON, TestIntraPred16, aom_dc_predictor_16x16_neon,
aom_dc_left_predictor_16x16_neon,
aom_dc_top_predictor_16x16_neon,
aom_dc_128_predictor_16x16_neon, aom_v_predictor_16x16_neon,
- aom_h_predictor_16x16_neon, NULL, NULL, NULL, NULL, NULL, NULL,
- aom_tm_predictor_16x16_neon)
+ aom_h_predictor_16x16_neon, aom_d45_predictor_16x16_neon, NULL,
+ NULL, NULL, NULL, NULL, tm_pred_func)
+#undef tm_pred_func
#endif // HAVE_NEON
#if HAVE_MSA
+#if CONFIG_ALT_INTRA
+#define tm_pred_func NULL
+#else
+#define tm_pred_func aom_tm_predictor_16x16_msa
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(MSA, TestIntraPred16, aom_dc_predictor_16x16_msa,
aom_dc_left_predictor_16x16_msa, aom_dc_top_predictor_16x16_msa,
aom_dc_128_predictor_16x16_msa, aom_v_predictor_16x16_msa,
aom_h_predictor_16x16_msa, NULL, NULL, NULL, NULL, NULL, NULL,
- aom_tm_predictor_16x16_msa)
+ tm_pred_func)
+#undef tm_pred_func
#endif // HAVE_MSA
// -----------------------------------------------------------------------------
// 32x32
+#if CONFIG_ALT_INTRA
+#define tm_pred_func aom_paeth_predictor_32x32_c
+#else
+#define tm_pred_func aom_tm_predictor_32x32_c
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(C, TestIntraPred32, aom_dc_predictor_32x32_c,
aom_dc_left_predictor_32x32_c, aom_dc_top_predictor_32x32_c,
aom_dc_128_predictor_32x32_c, aom_v_predictor_32x32_c,
- aom_h_predictor_32x32_c, aom_d45e_predictor_32x32_c,
+ aom_h_predictor_32x32_c, aom_d45_predictor_32x32_c,
aom_d135_predictor_32x32_c, aom_d117_predictor_32x32_c,
- aom_d153_predictor_32x32_c, aom_d207e_predictor_32x32_c,
- aom_d63e_predictor_32x32_c, aom_tm_predictor_32x32_c)
+ aom_d153_predictor_32x32_c, aom_d207_predictor_32x32_c,
+ aom_d63_predictor_32x32_c, tm_pred_func)
+#undef tm_pred_func
#if HAVE_SSE2
+#if CONFIG_ALT_INTRA
+#define tm_pred_func NULL
+#else
+#define tm_pred_func aom_tm_predictor_32x32_sse2
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(SSE2, TestIntraPred32, aom_dc_predictor_32x32_sse2,
aom_dc_left_predictor_32x32_sse2,
aom_dc_top_predictor_32x32_sse2,
aom_dc_128_predictor_32x32_sse2, aom_v_predictor_32x32_sse2,
aom_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
- aom_tm_predictor_32x32_sse2)
+ tm_pred_func)
+#undef tm_pred_func
#endif // HAVE_SSE2
#if HAVE_SSSE3
INTRA_PRED_TEST(SSSE3, TestIntraPred32, NULL, NULL, NULL, NULL, NULL, NULL,
- NULL, NULL, NULL, aom_d153_predictor_32x32_ssse3, NULL, NULL,
- NULL)
+ aom_d45_predictor_32x32_ssse3, NULL, NULL,
+ aom_d153_predictor_32x32_ssse3, aom_d207_predictor_32x32_ssse3,
+ aom_d63_predictor_32x32_ssse3, NULL)
#endif // HAVE_SSSE3
#if HAVE_NEON
+#if CONFIG_ALT_INTRA
+#define tm_pred_func NULL
+#else
+#define tm_pred_func aom_tm_predictor_32x32_neon
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(NEON, TestIntraPred32, aom_dc_predictor_32x32_neon,
aom_dc_left_predictor_32x32_neon,
aom_dc_top_predictor_32x32_neon,
aom_dc_128_predictor_32x32_neon, aom_v_predictor_32x32_neon,
aom_h_predictor_32x32_neon, NULL, NULL, NULL, NULL, NULL, NULL,
- aom_tm_predictor_32x32_neon)
+ tm_pred_func)
+#undef tm_pred_func
#endif // HAVE_NEON
#if HAVE_MSA
+#if CONFIG_ALT_INTRA
+#define tm_pred_func NULL
+#else
+#define tm_pred_func aom_tm_predictor_32x32_msa
+#endif // CONFIG_ALT_INTRA
INTRA_PRED_TEST(MSA, TestIntraPred32, aom_dc_predictor_32x32_msa,
aom_dc_left_predictor_32x32_msa, aom_dc_top_predictor_32x32_msa,
aom_dc_128_predictor_32x32_msa, aom_v_predictor_32x32_msa,
aom_h_predictor_32x32_msa, NULL, NULL, NULL, NULL, NULL, NULL,
- aom_tm_predictor_32x32_msa)
+ tm_pred_func)
+#undef tm_pred_func
#endif // HAVE_MSA
#include "test/test_libaom.cc"
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index 0c14d65..57f4a60 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -21,12 +21,13 @@
#include "aom_mem/aom_mem.h"
namespace {
-class TileIndependenceTest : public ::libaom_test::EncoderTest,
- public ::libaom_test::CodecTestWithParam<int> {
+class TileIndependenceTest
+ : public ::libaom_test::EncoderTest,
+ public ::libaom_test::CodecTestWith2Params<int, int> {
protected:
TileIndependenceTest()
: EncoderTest(GET_PARAM(0)), md5_fw_order_(), md5_inv_order_(),
- n_tiles_(GET_PARAM(1)) {
+ n_tile_cols_(GET_PARAM(1)), n_tile_rows_(GET_PARAM(2)) {
init_flags_ = AOM_CODEC_USE_PSNR;
aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
cfg.w = 704;
@@ -35,6 +36,15 @@
fw_dec_ = codec_->CreateDecoder(cfg, 0);
inv_dec_ = codec_->CreateDecoder(cfg, 0);
inv_dec_->Control(AV1_INVERT_TILE_DECODE_ORDER, 1);
+
+#if CONFIG_AV1 && CONFIG_EXT_TILE
+ if (fw_dec_->IsAV1() && inv_dec_->IsAV1()) {
+ fw_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ fw_dec_->Control(AV1_SET_DECODE_TILE_COL, -1);
+ inv_dec_->Control(AV1_SET_DECODE_TILE_ROW, -1);
+ inv_dec_->Control(AV1_SET_DECODE_TILE_COL, -1);
+ }
+#endif
}
virtual ~TileIndependenceTest() {
@@ -50,10 +60,17 @@
virtual void PreEncodeFrameHook(libaom_test::VideoSource *video,
libaom_test::Encoder *encoder) {
if (video->frame() == 1) {
- encoder->Control(AV1E_SET_TILE_COLUMNS, n_tiles_);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, n_tile_cols_);
+ encoder->Control(AV1E_SET_TILE_ROWS, n_tile_rows_);
+ SetCpuUsed(encoder);
}
}
+ virtual void SetCpuUsed(libaom_test::Encoder *encoder) {
+ static const int kCpuUsed = 3;
+ encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+ }
+
void UpdateMD5(::libaom_test::Decoder *dec, const aom_codec_cx_pkt_t *pkt,
::libaom_test::MD5 *md5) {
const aom_codec_err_t res = dec->DecodeFrame(
@@ -71,45 +88,58 @@
UpdateMD5(inv_dec_, pkt, &md5_inv_order_);
}
+ void DoTest() {
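+ // Encode a short clip once, decode it both in normal and in inverted tile
+ // order, and require the two MD5 checksums to match; a mismatch means the
+ // tiles are not independently decodable.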
+ const aom_rational timebase = { 33333333, 1000000000 };
+ cfg_.g_timebase = timebase;
+ cfg_.rc_target_bitrate = 500;
+ cfg_.g_lag_in_frames = 12;
+ cfg_.rc_end_usage = AOM_VBR;
+
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 576,
+ timebase.den, timebase.num, 0, 5);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+ const char *md5_fw_str = md5_fw_order_.Get();
+ const char *md5_inv_str = md5_inv_order_.Get();
+ ASSERT_STREQ(md5_fw_str, md5_inv_str);
+ }
+
::libaom_test::MD5 md5_fw_order_, md5_inv_order_;
::libaom_test::Decoder *fw_dec_, *inv_dec_;
private:
- int n_tiles_;
+ int n_tile_cols_;
+ int n_tile_rows_;
};
// run an encode with 2 or 4 tiles, and do the decode both in normal and
// inverted tile ordering. Ensure that the MD5 of the output in both cases
// is identical. If so, tiles are considered independent and the test passes.
-TEST_P(TileIndependenceTest, MD5Match) {
- const aom_rational timebase = { 33333333, 1000000000 };
- cfg_.g_timebase = timebase;
- cfg_.rc_target_bitrate = 500;
- cfg_.g_lag_in_frames = 25;
- cfg_.rc_end_usage = AOM_VBR;
+TEST_P(TileIndependenceTest, MD5Match) { DoTest(); }
- libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 144,
- timebase.den, timebase.num, 0, 30);
- ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+class TileIndependenceTestLarge : public TileIndependenceTest {
+ virtual void SetCpuUsed(libaom_test::Encoder *encoder) {
+ static const int kCpuUsed = 0;
+ encoder->Control(AOME_SET_CPUUSED, kCpuUsed);
+ }
+};
- const char *md5_fw_str = md5_fw_order_.Get();
- const char *md5_inv_str = md5_inv_order_.Get();
+TEST_P(TileIndependenceTestLarge, MD5Match) { DoTest(); }
- // could use ASSERT_EQ(!memcmp(.., .., 16) here, but this gives nicer
- // output if it fails. Not sure if it's helpful since it's really just
- // a MD5...
- ASSERT_STREQ(md5_fw_str, md5_inv_str);
-}
#if CONFIG_EC_ADAPT
// TODO(thdavies): EC_ADAPT does not support tiles
-INSTANTIATE_TEST_CASE_P(
- DISABLED_AV1, TileIndependenceTest,
- ::testing::Combine(
- ::testing::Values(
- static_cast<const libaom_test::CodecFactory *>(&libaom_test::kAV1)),
- ::testing::Range(0, 2, 1)));
#else
-AV1_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1));
-#endif
-
+#if CONFIG_EXT_TILE
+AV1_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Values(1, 2, 32),
+ ::testing::Values(1, 2, 32));
+AV1_INSTANTIATE_TEST_CASE(TileIndependenceTestLarge,
+ ::testing::Values(1, 2, 32),
+ ::testing::Values(1, 2, 32));
+#else
+AV1_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Values(0, 1),
+ ::testing::Values(0, 1));
+AV1_INSTANTIATE_TEST_CASE(TileIndependenceTestLarge, ::testing::Values(0, 1),
+ ::testing::Values(0, 1));
+#endif // CONFIG_EXT_TILE
+#endif // CONFIG_EC_ADAPT
} // namespace
diff --git a/test/tools_common.sh b/test/tools_common.sh
index 11d9082..254e6b2 100755
--- a/test/tools_common.sh
+++ b/test/tools_common.sh
@@ -183,6 +183,8 @@
av1_encode_available() {
[ "$(aom_config_option_enabled CONFIG_AV1_ENCODER)" = "yes" ] && echo yes
}
+
+# Echoes yes to stdout when aom_config_option_enabled() reports yes for
# CONFIG_WEBM_IO.
webm_io_available() {
[ "$(aom_config_option_enabled CONFIG_WEBM_IO)" = "yes" ] && echo yes
@@ -411,12 +413,12 @@
fi
# Variables shared by tests.
-AOM_IVF_FILE="${LIBAOM_TEST_DATA_PATH}/aom0-00-comprehensive-001.ivf"
-AV1_IVF_FILE="${LIBAOM_TEST_DATA_PATH}/av10-2-09-subpixel-00.ivf"
+VP8_IVF_FILE="${LIBAOM_TEST_DATA_PATH}/vp80-00-comprehensive-001.ivf"
+AV1_IVF_FILE="${LIBAOM_TEST_DATA_PATH}/vp90-2-09-subpixel-00.ivf"
-AV1_WEBM_FILE="${LIBAOM_TEST_DATA_PATH}/av10-2-00-quantizer-00.webm"
-AV1_FPM_WEBM_FILE="${LIBAOM_TEST_DATA_PATH}/av10-2-07-frame_parallel-1.webm"
-AV1_LT_50_FRAMES_WEBM_FILE="${LIBAOM_TEST_DATA_PATH}/av10-2-02-size-32x08.webm"
+AV1_WEBM_FILE="${LIBAOM_TEST_DATA_PATH}/vp90-2-00-quantizer-00.webm"
+AV1_FPM_WEBM_FILE="${LIBAOM_TEST_DATA_PATH}/vp90-2-07-frame_parallel-1.webm"
+AV1_LT_50_FRAMES_WEBM_FILE="${LIBAOM_TEST_DATA_PATH}/vp90-2-02-size-32x08.webm"
YUV_RAW_INPUT="${LIBAOM_TEST_DATA_PATH}/hantro_collage_w352h288.yuv"
YUV_RAW_INPUT_WIDTH=352
diff --git a/test/transform_test_base.h b/test/transform_test_base.h
new file mode 100644
index 0000000..64bf2d6
--- /dev/null
+++ b/test/transform_test_base.h
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_TRANSFORM_TEST_BASE_H_
+#define TEST_TRANSFORM_TEST_BASE_H_
+
+#include "./aom_config.h"
+#include "aom_mem/aom_mem.h"
+#include "aom/aom_codec.h"
+
+namespace libaom_test {
+
+// Note:
+// The same constant is defined in av1/common/av1_entropy.h and
+// av1/common/entropy.h. The goal is to make this base class usable for
+// transform testing in future codecs, but including either of those headers
+// leads to a compile error when unit testing another codec. It would be
+// better to move the definition into an aom header file.
+const int kDctMaxValue = 16384;
+
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
+ int tx_type);
+
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+ int tx_type);
+
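+// FhtFunc is the forward hybrid-transform signature and IhtFunc the inverse
+// one; concrete tests provide them through fwd_txfm_ref / inv_txfm_ref and
+// the RunFwdTxfm / RunInvTxfm overrides below.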
+class TransformTestBase {
+ public:
+ virtual ~TransformTestBase() {}
+
+ protected:
+ virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0;
+
+ virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0;
+
+ void RunAccuracyCheck(int limit) {
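+ // Round-trip check: forward-transform a random residual block, inverse-
+ // transform it back, and verify that the per-pixel squared error never
+ // exceeds 'limit' and that the average error per block stays within 'limit'.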
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ uint32_t max_error = 0;
+ int64_t total_error = 0;
+ const int count_test_block = 10000;
+
+ int16_t *test_input_block = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+ tran_low_t *test_temp_block = reinterpret_cast<tran_low_t *>(
+ aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+ uint8_t *dst = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
+ uint8_t *src = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
+#if CONFIG_AOM_HIGHBITDEPTH
+ uint16_t *dst16 = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
+ uint16_t *src16 = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
+#endif
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-255, 255].
+ for (int j = 0; j < num_coeffs_; ++j) {
+ if (bit_depth_ == AOM_BITS_8) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ test_input_block[j] = src[j] - dst[j];
+#if CONFIG_AOM_HIGHBITDEPTH
+ } else {
+ src16[j] = rnd.Rand16() & mask_;
+ dst16[j] = rnd.Rand16() & mask_;
+ test_input_block[j] = src16[j] - dst16[j];
+#endif
+ }
+ }
+
+ ASM_REGISTER_STATE_CHECK(
+ RunFwdTxfm(test_input_block, test_temp_block, pitch_));
+ if (bit_depth_ == AOM_BITS_8) {
+ ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_AOM_HIGHBITDEPTH
+ } else {
+ ASM_REGISTER_STATE_CHECK(
+ RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+ }
+
+ for (int j = 0; j < num_coeffs_; ++j) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ const int diff =
+ bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+ ASSERT_EQ(AOM_BITS_8, bit_depth_);
+ const int diff = dst[j] - src[j];
+#endif
+ const uint32_t error = diff * diff;
+ if (max_error < error) max_error = error;
+ total_error += error;
+ }
+ }
+
+ EXPECT_GE(static_cast<uint32_t>(limit), max_error)
+ << "Error: 4x4 FHT/IHT has an individual round trip error > " << limit;
+
+ EXPECT_GE(count_test_block * limit, total_error)
+ << "Error: 4x4 FHT/IHT has average round trip error > " << limit
+ << " per block";
+
+ aom_free(test_input_block);
+ aom_free(test_temp_block);
+ aom_free(dst);
+ aom_free(src);
+#if CONFIG_AOM_HIGHBITDEPTH
+ aom_free(dst16);
+ aom_free(src16);
+#endif
+ }
+
+ void RunCoeffCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 5000;
+
+ // Use a stride value which is not the width of any transform, to catch
+ // cases where the transforms use the stride incorrectly.
+ int stride = 96;
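+ // (96 is a multiple of each tested transform width but equal to none of
+ // them, so a transform that mixes up stride and width gets caught.)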
+
+ int16_t *input_block = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * stride * height_));
+ tran_low_t *output_ref_block = reinterpret_cast<tran_low_t *>(
+ aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+ tran_low_t *output_block = reinterpret_cast<tran_low_t *>(
+ aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+
+ for (int i = 0; i < count_test_block; ++i) {
+ int j, k;
+ for (j = 0; j < height_; ++j) {
+ for (k = 0; k < pitch_; ++k) {
+ int in_idx = j * stride + k;
+ int out_idx = j * pitch_ + k;
+ input_block[in_idx] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+ if (bit_depth_ == AOM_BITS_8) {
+ output_block[out_idx] = output_ref_block[out_idx] = rnd.Rand8();
+#if CONFIG_AOM_HIGHBITDEPTH
+ } else {
+ output_block[out_idx] = output_ref_block[out_idx] =
+ rnd.Rand16() & mask_;
+#endif
+ }
+ }
+ }
+
+ fwd_txfm_ref(input_block, output_ref_block, stride, tx_type_);
+ ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, stride));
+
+ // The minimum quant value is 4.
+ for (j = 0; j < height_; ++j) {
+ for (k = 0; k < pitch_; ++k) {
+ int out_idx = j * pitch_ + k;
+ ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx])
+ << "Error: not bit-exact result at index: " << out_idx
+ << " at test block: " << i;
+ }
+ }
+ }
+ aom_free(input_block);
+ aom_free(output_ref_block);
+ aom_free(output_block);
+ }
+
+ void RunInvCoeffCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 5000;
+
+ // Use a stride value which is not the width of any transform, to catch
+ // cases where the transforms use the stride incorrectly.
+ int stride = 96;
+
+ int16_t *input_block = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+ tran_low_t *trans_block = reinterpret_cast<tran_low_t *>(
+ aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+ uint8_t *output_block = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * stride * height_));
+ uint8_t *output_ref_block = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * stride * height_));
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-mask_, mask_].
+ int j, k;
+ for (j = 0; j < height_; ++j) {
+ for (k = 0; k < pitch_; ++k) {
+ int in_idx = j * pitch_ + k;
+ int out_idx = j * stride + k;
+ input_block[in_idx] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+ output_ref_block[out_idx] = rnd.Rand16() & mask_;
+ output_block[out_idx] = output_ref_block[out_idx];
+ }
+ }
+
+ fwd_txfm_ref(input_block, trans_block, pitch_, tx_type_);
+
+ inv_txfm_ref(trans_block, output_ref_block, stride, tx_type_);
+ ASM_REGISTER_STATE_CHECK(RunInvTxfm(trans_block, output_block, stride));
+
+ for (j = 0; j < height_; ++j) {
+ for (k = 0; k < pitch_; ++k) {
+ int out_idx = j * stride + k;
+ ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx])
+ << "Error: not bit-exact result at index: " << out_idx
+ << " j = " << j << " k = " << k << " at test block: " << i;
+ }
+ }
+ }
+ aom_free(input_block);
+ aom_free(trans_block);
+ aom_free(output_ref_block);
+ aom_free(output_block);
+ }
+
+ void RunMemCheck() {
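+ // Feed extreme inputs (every sample at +mask_ or -mask_) and check that the
+ // output is bit-exact with the reference and that no coefficient exceeds
+ // row_length * DCT_MAX_VALUE (scaled up for high bit depths).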
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 5000;
+
+ int16_t *input_extreme_block = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+ tran_low_t *output_ref_block = reinterpret_cast<tran_low_t *>(
+ aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+ tran_low_t *output_block = reinterpret_cast<tran_low_t *>(
+ aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-mask_, mask_].
+ for (int j = 0; j < num_coeffs_; ++j) {
+ input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
+ }
+ if (i == 0) {
+ for (int j = 0; j < num_coeffs_; ++j) input_extreme_block[j] = mask_;
+ } else if (i == 1) {
+ for (int j = 0; j < num_coeffs_; ++j) input_extreme_block[j] = -mask_;
+ }
+
+ fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
+ ASM_REGISTER_STATE_CHECK(
+ RunFwdTxfm(input_extreme_block, output_block, pitch_));
+
+ int row_length = FindRowLength();
+ // The minimum quant value is 4.
+ for (int j = 0; j < num_coeffs_; ++j) {
+ EXPECT_EQ(output_block[j], output_ref_block[j]);
+ EXPECT_GE(row_length * kDctMaxValue << (bit_depth_ - 8),
+ abs(output_block[j]))
+ << "Error: NxN FDCT has coefficient larger than N*DCT_MAX_VALUE";
+ }
+ }
+ aom_free(input_extreme_block);
+ aom_free(output_ref_block);
+ aom_free(output_block);
+ }
+
+ void RunInvAccuracyCheck(int limit) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 1000;
+
+ int16_t *in = reinterpret_cast<int16_t *>(
+ aom_memalign(16, sizeof(int16_t) * num_coeffs_));
+ tran_low_t *coeff = reinterpret_cast<tran_low_t *>(
+ aom_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+ uint8_t *dst = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
+ uint8_t *src = reinterpret_cast<uint8_t *>(
+ aom_memalign(16, sizeof(uint8_t) * num_coeffs_));
+
+#if CONFIG_AOM_HIGHBITDEPTH
+ uint16_t *dst16 = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
+ uint16_t *src16 = reinterpret_cast<uint16_t *>(
+ aom_memalign(16, sizeof(uint16_t) * num_coeffs_));
+#endif
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-mask_, mask_].
+ for (int j = 0; j < num_coeffs_; ++j) {
+ if (bit_depth_ == AOM_BITS_8) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ in[j] = src[j] - dst[j];
+#if CONFIG_AOM_HIGHBITDEPTH
+ } else {
+ src16[j] = rnd.Rand16() & mask_;
+ dst16[j] = rnd.Rand16() & mask_;
+ in[j] = src16[j] - dst16[j];
+#endif
+ }
+ }
+
+ fwd_txfm_ref(in, coeff, pitch_, tx_type_);
+
+ if (bit_depth_ == AOM_BITS_8) {
+ ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+#if CONFIG_AOM_HIGHBITDEPTH
+ } else {
+ ASM_REGISTER_STATE_CHECK(
+ RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+ }
+
+ for (int j = 0; j < num_coeffs_; ++j) {
+#if CONFIG_AOM_HIGHBITDEPTH
+ const int diff =
+ bit_depth_ == AOM_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+ const int diff = dst[j] - src[j];
+#endif
+ const uint32_t error = diff * diff;
+ EXPECT_GE(static_cast<uint32_t>(limit), error)
+ << "Error: 4x4 IDCT has error " << error << " at index " << j;
+ }
+ }
+ aom_free(in);
+ aom_free(coeff);
+ aom_free(dst);
+ aom_free(src);
+#if CONFIG_AOM_HIGHBITDEPTH
+ aom_free(src16);
+ aom_free(dst16);
+#endif
+ }
+
+ int pitch_;
+ int height_;
+ int tx_type_;
+ FhtFunc fwd_txfm_ref;
+ IhtFunc inv_txfm_ref;
+ aom_bit_depth_t bit_depth_;
+ int mask_;
+ int num_coeffs_;
+
+ private:
+ // Assume transform size is 4x4, 8x8, 16x16,...
+ int FindRowLength() const {
+ int row = 4;
+ if (16 == num_coeffs_) {
+ row = 4;
+ } else if (64 == num_coeffs_) {
+ row = 8;
+ } else if (256 == num_coeffs_) {
+ row = 16;
+ } else if (1024 == num_coeffs_) {
+ row = 32;
+ }
+ return row;
+ }
+};
+
+} // namespace libaom_test
+
+#endif // TEST_TRANSFORM_TEST_BASE_H_
diff --git a/test/twopass_encoder.sh b/test/twopass_encoder.sh
index 259b0f4..3abb762 100755
--- a/test/twopass_encoder.sh
+++ b/test/twopass_encoder.sh
@@ -23,7 +23,8 @@
fi
}
-# Runs twopass_encoder using the codec specified by $1.
+# Runs twopass_encoder using the codec specified by $1 with a frame limit of
+# 100.
twopass_encoder() {
local encoder="${LIBAOM_BIN_PATH}/twopass_encoder${AOM_TEST_EXE_SUFFIX}"
local codec="$1"
diff --git a/test/user_priv_test.cc b/test/user_priv_test.cc
new file mode 100644
index 0000000..54d4ee6
--- /dev/null
+++ b/test/user_priv_test.cc
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "./aom_config.h"
+#include "test/acm_random.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#if CONFIG_WEBM_IO
+#include "test/webm_video_source.h"
+#endif
+#include "aom_mem/aom_mem.h"
+#include "aom/aom.h"
+
+namespace {
+
+using std::string;
+using libaom_test::ACMRandom;
+
+#if CONFIG_WEBM_IO
+
+void CheckUserPrivateData(void *user_priv, int *target) {
+ // The actual pointer value should be the same as the expected one.
+ EXPECT_EQ(reinterpret_cast<void *>(target), user_priv)
+ << "user_priv pointer value does not match.";
+}
+
+// Decodes |filename|. Passes in user_priv data when calling DecodeFrame and
+// compares the user_priv from the returned img with the original user_priv to
+// see if they match. Both the pointer values and the values stored at those
+// addresses should match.
+string DecodeFile(const string &filename) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ libaom_test::WebMVideoSource video(filename);
+ video.Init();
+
+ aom_codec_dec_cfg_t cfg = aom_codec_dec_cfg_t();
+ libaom_test::AV1Decoder decoder(cfg, 0);
+
+ libaom_test::MD5 md5;
+ int frame_num = 0;
+ for (video.Begin(); !::testing::Test::HasFailure() && video.cxdata();
+ video.Next()) {
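+ // Frame 0 is decoded with user_priv == NULL so that both the NULL and the
+ // non-NULL paths through the decoder are exercised.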
+ void *user_priv = reinterpret_cast<void *>(&frame_num);
+ const aom_codec_err_t res =
+ decoder.DecodeFrame(video.cxdata(), video.frame_size(),
+ (frame_num == 0) ? NULL : user_priv);
+ if (res != AOM_CODEC_OK) {
+ EXPECT_EQ(AOM_CODEC_OK, res) << decoder.DecodeError();
+ break;
+ }
+ libaom_test::DxDataIterator dec_iter = decoder.GetDxData();
+ const aom_image_t *img = NULL;
+
+ // Get decompressed data.
+ while ((img = dec_iter.Next())) {
+ if (frame_num == 0) {
+ CheckUserPrivateData(img->user_priv, NULL);
+ } else {
+ CheckUserPrivateData(img->user_priv, &frame_num);
+
+ // Also test the ctrl_get_reference API.
+ struct av1_ref_frame ref;
+ // Randomly fetch a reference frame.
+ ref.idx = rnd.Rand8() % 3;
+ decoder.Control(AV1_GET_REFERENCE, &ref);
+
+ CheckUserPrivateData(ref.img.user_priv, NULL);
+ }
+ md5.Add(img);
+ }
+
+ frame_num++;
+ }
+ return string(md5.Get());
+}
+
+TEST(UserPrivTest, VideoDecode) {
+ // No tiles and no frame-parallel decoding; this exercises the decode path
+ // to test user_priv.
+ EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc",
+ DecodeFile("av10-2-03-size-226x226.webm").c_str());
+}
+
+#endif // CONFIG_WEBM_IO
+
+} // namespace
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 15231da..7848e20 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -42,9 +42,6 @@
const uint8_t *b, int b_stride);
typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
-using ::std::tr1::get;
-using ::std::tr1::make_tuple;
-using ::std::tr1::tuple;
using libaom_test::ACMRandom;
// Truncate high bit depth results by downshifting (with rounding) by:
@@ -73,10 +70,13 @@
return res;
}
+/* Note:
+ * Our codebase computes the "diff" value in the variance algorithm as
+ * (src - ref).
+ */
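+// For reference, the returned value is sse - se^2 / (w*h): the sum of squared
+// differences with the squared mean difference removed.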
static uint32_t variance_ref(const uint8_t *src, const uint8_t *ref, int l2w,
- int l2h, int src_stride_coeff,
- int ref_stride_coeff, uint32_t *sse_ptr,
- bool use_high_bit_depth_,
+ int l2h, int src_stride, int ref_stride,
+ uint32_t *sse_ptr, bool use_high_bit_depth_,
aom_bit_depth_t bit_depth) {
int64_t se = 0;
uint64_t sse = 0;
@@ -86,14 +86,13 @@
for (int x = 0; x < w; x++) {
int diff;
if (!use_high_bit_depth_) {
- diff = ref[w * y * ref_stride_coeff + x] -
- src[w * y * src_stride_coeff + x];
+ diff = src[y * src_stride + x] - ref[y * ref_stride + x];
se += diff;
sse += diff * diff;
#if CONFIG_AOM_HIGHBITDEPTH
} else {
- diff = CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x] -
- CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x];
+ diff = CONVERT_TO_SHORTPTR(src)[y * src_stride + x] -
+ CONVERT_TO_SHORTPTR(ref)[y * ref_stride + x];
se += diff;
sse += diff * diff;
#endif // CONFIG_AOM_HIGHBITDEPTH
@@ -109,7 +108,7 @@
* they calculate the bilinear factors directly instead of using a lookup table
* and therefore upshift xoff and yoff by 1. Only every other calculated value
* is used so the codec version shrinks the table to save space and maintain
- * compatibility with aom.
+ * compatibility with vp8.
*/
static uint32_t subpel_variance_ref(const uint8_t *ref, const uint8_t *src,
int l2w, int l2h, int xoff, int yoff,
@@ -160,6 +159,61 @@
return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
}
+static uint32_t subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src,
+ const uint8_t *second_pred, int l2w,
+ int l2h, int xoff, int yoff,
+ uint32_t *sse_ptr,
+ bool use_high_bit_depth,
+ aom_bit_depth_t bit_depth) {
+ int64_t se = 0;
+ uint64_t sse = 0;
+ const int w = 1 << l2w;
+ const int h = 1 << l2h;
+
+ xoff <<= 1;
+ yoff <<= 1;
+
+ for (int y = 0; y < h; y++) {
+ for (int x = 0; x < w; x++) {
+ // bilinear interpolation at a 16th pel step
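+ // followed by averaging with the corresponding second_pred pixel (with
+ // rounding) before taking the difference against src.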
+ if (!use_high_bit_depth) {
+ const int a1 = ref[(w + 1) * (y + 0) + x + 0];
+ const int a2 = ref[(w + 1) * (y + 0) + x + 1];
+ const int b1 = ref[(w + 1) * (y + 1) + x + 0];
+ const int b2 = ref[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int diff =
+ ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
+ se += diff;
+ sse += diff * diff;
+#if CONFIG_AOM_HIGHBITDEPTH
+ } else {
+ const uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred);
+ const int a1 = ref16[(w + 1) * (y + 0) + x + 0];
+ const int a2 = ref16[(w + 1) * (y + 0) + x + 1];
+ const int b1 = ref16[(w + 1) * (y + 1) + x + 0];
+ const int b2 = ref16[(w + 1) * (y + 1) + x + 1];
+ const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
+ const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
+ const int r = a + (((b - a) * yoff + 8) >> 4);
+ const int diff = ((r + sec16[w * y + x] + 1) >> 1) - src16[w * y + x];
+ se += diff;
+ sse += diff * diff;
+#endif // CONFIG_AOM_HIGHBITDEPTH
+ }
+ }
+ }
+ RoundHighBitDepth(bit_depth, &se, &sse);
+ *sse_ptr = static_cast<uint32_t>(sse);
+ return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
public:
SumOfSquaresTest() : func_(GetParam()) {}
@@ -200,329 +254,282 @@
}
}
-template <typename VarianceFunctionType>
-class VarianceTest : public ::testing::TestWithParam<
- tuple<int, int, VarianceFunctionType, int> > {
+////////////////////////////////////////////////////////////////////////////////
+// Encapsulating struct to store the function to test along with
+// some testing context.
+// Can be used for MSE, SSE, Variance, etc.
+
+template <typename Func>
+struct TestParams {
+ TestParams(int log2w = 0, int log2h = 0, Func function = NULL,
+ int bit_depth_value = 0)
+ : log2width(log2w), log2height(log2h), func(function) {
+ use_high_bit_depth = (bit_depth_value > 0);
+ if (use_high_bit_depth) {
+ bit_depth = static_cast<aom_bit_depth_t>(bit_depth_value);
+ } else {
+ bit_depth = AOM_BITS_8;
+ }
+ width = 1 << log2width;
+ height = 1 << log2height;
+ block_size = width * height;
+ mask = (1u << bit_depth) - 1;
+ }
+
+ int log2width, log2height;
+ int width, height;
+ int block_size;
+ Func func;
+ aom_bit_depth_t bit_depth;
+ bool use_high_bit_depth;
+ uint32_t mask;
+};
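+// e.g. TestParams<VarianceMxNFunc>(4, 4, &aom_variance16x16_c) describes a
+// 16x16, 8-bit variance test; a non-zero fourth argument selects the high
+// bit depth.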
+
+template <typename Func>
+std::ostream &operator<<(std::ostream &os, const TestParams<Func> &p) {
+ return os << "log2width/height:" << p.log2width << "/" << p.log2height
+ << " function:" << reinterpret_cast<const void *>(p.func)
+ << " bit-depth:" << p.bit_depth;
+}
+
+// Main class for testing a function type
+template <typename FunctionType>
+class MainTestClass
+ : public ::testing::TestWithParam<TestParams<FunctionType> > {
public:
virtual void SetUp() {
- const tuple<int, int, VarianceFunctionType, int> ¶ms = this->GetParam();
- log2width_ = get<0>(params);
- width_ = 1 << log2width_;
- log2height_ = get<1>(params);
- height_ = 1 << log2height_;
- variance_ = get<2>(params);
- if (get<3>(params)) {
- bit_depth_ = static_cast<aom_bit_depth_t>(get<3>(params));
- use_high_bit_depth_ = true;
- } else {
- bit_depth_ = AOM_BITS_8;
- use_high_bit_depth_ = false;
- }
- mask_ = (1 << bit_depth_) - 1;
+ params_ = this->GetParam();
rnd_.Reset(ACMRandom::DeterministicSeed());
- block_size_ = width_ * height_;
- if (!use_high_bit_depth_) {
- src_ = reinterpret_cast<uint8_t *>(aom_memalign(16, block_size_ * 2));
- ref_ = new uint8_t[block_size_ * 2];
-#if CONFIG_AOM_HIGHBITDEPTH
- } else {
- src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
- aom_memalign(16, block_size_ * 2 * sizeof(uint16_t))));
- ref_ = CONVERT_TO_BYTEPTR(new uint16_t[block_size_ * 2]);
-#endif // CONFIG_AOM_HIGHBITDEPTH
- }
+ const size_t unit =
+ use_high_bit_depth() ? sizeof(uint16_t) : sizeof(uint8_t);
+ src_ = reinterpret_cast<uint8_t *>(aom_memalign(16, block_size() * unit));
+ ref_ = new uint8_t[block_size() * unit];
ASSERT_TRUE(src_ != NULL);
ASSERT_TRUE(ref_ != NULL);
+#if CONFIG_AOM_HIGHBITDEPTH
+ if (use_high_bit_depth()) {
+ // TODO(skal): remove!
+ src_ = CONVERT_TO_BYTEPTR(src_);
+ ref_ = CONVERT_TO_BYTEPTR(ref_);
+ }
+#endif
}
virtual void TearDown() {
- if (!use_high_bit_depth_) {
- aom_free(src_);
- delete[] ref_;
#if CONFIG_AOM_HIGHBITDEPTH
- } else {
- aom_free(CONVERT_TO_SHORTPTR(src_));
- delete[] CONVERT_TO_SHORTPTR(ref_);
-#endif // CONFIG_AOM_HIGHBITDEPTH
+ if (use_high_bit_depth()) {
+ // TODO(skal): remove!
+ src_ = reinterpret_cast<uint8_t *>(CONVERT_TO_SHORTPTR(src_));
+ ref_ = reinterpret_cast<uint8_t *>(CONVERT_TO_SHORTPTR(ref_));
}
+#endif
+
+ aom_free(src_);
+ delete[] ref_;
+ src_ = NULL;
+ ref_ = NULL;
libaom_test::ClearSystemState();
}
protected:
+ // We could sub-class MainTestClass into dedicated classes for Variance
+ // and MSE/SSE, but that involves a lot of 'this->xxx' dereferencing to
+ // access the base-class fields. That's cumbersome, so for now we just
+ // implement the testing methods here:
+
+ // Variance tests
void ZeroTest();
void RefTest();
void RefStrideTest();
void OneQuarterTest();
+ // MSE/SSE tests
+ void RefTestMse();
+ void RefTestSse();
+ void MaxTestMse();
+ void MaxTestSse();
+
+ protected:
ACMRandom rnd_;
uint8_t *src_;
uint8_t *ref_;
- int width_, log2width_;
- int height_, log2height_;
- aom_bit_depth_t bit_depth_;
- int mask_;
- bool use_high_bit_depth_;
- int block_size_;
- VarianceFunctionType variance_;
+ TestParams<FunctionType> params_;
+
+ // some relay helpers
+ bool use_high_bit_depth() const { return params_.use_high_bit_depth; }
+ int byte_shift() const { return params_.bit_depth - 8; }
+ int block_size() const { return params_.block_size; }
+ int width() const { return params_.width; }
+ uint32_t mask() const { return params_.mask; }
};
+////////////////////////////////////////////////////////////////////////////////
+// Tests related to variance.
+
template <typename VarianceFunctionType>
-void VarianceTest<VarianceFunctionType>::ZeroTest() {
+void MainTestClass<VarianceFunctionType>::ZeroTest() {
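+ // For every pair of constant-valued src and ref blocks the variance must
+ // be exactly zero, regardless of the two constants chosen.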
for (int i = 0; i <= 255; ++i) {
- if (!use_high_bit_depth_) {
- memset(src_, i, block_size_);
-#if CONFIG_AOM_HIGHBITDEPTH
+ if (!use_high_bit_depth()) {
+ memset(src_, i, block_size());
} else {
- aom_memset16(CONVERT_TO_SHORTPTR(src_), i << (bit_depth_ - 8),
- block_size_);
-#endif // CONFIG_AOM_HIGHBITDEPTH
+ uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_);
+ for (int k = 0; k < block_size(); ++k) src16[k] = i << byte_shift();
}
for (int j = 0; j <= 255; ++j) {
- if (!use_high_bit_depth_) {
- memset(ref_, j, block_size_);
-#if CONFIG_AOM_HIGHBITDEPTH
+ if (!use_high_bit_depth()) {
+ memset(ref_, j, block_size());
} else {
- aom_memset16(CONVERT_TO_SHORTPTR(ref_), j << (bit_depth_ - 8),
- block_size_);
-#endif // CONFIG_AOM_HIGHBITDEPTH
+ uint16_t *const ref16 = CONVERT_TO_SHORTPTR(ref_);
+ for (int k = 0; k < block_size(); ++k) ref16[k] = j << byte_shift();
}
- unsigned int sse;
- unsigned int var;
- ASM_REGISTER_STATE_CHECK(var =
- variance_(src_, width_, ref_, width_, &sse));
+ unsigned int sse, var;
+ ASM_REGISTER_STATE_CHECK(
+ var = params_.func(src_, width(), ref_, width(), &sse));
EXPECT_EQ(0u, var) << "src values: " << i << " ref values: " << j;
}
}
}
template <typename VarianceFunctionType>
-void VarianceTest<VarianceFunctionType>::RefTest() {
+void MainTestClass<VarianceFunctionType>::RefTest() {
for (int i = 0; i < 10; ++i) {
- for (int j = 0; j < block_size_; j++) {
- if (!use_high_bit_depth_) {
+ for (int j = 0; j < block_size(); j++) {
+ if (!use_high_bit_depth()) {
src_[j] = rnd_.Rand8();
ref_[j] = rnd_.Rand8();
#if CONFIG_AOM_HIGHBITDEPTH
} else {
- CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() && mask_;
- CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() && mask_;
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
#endif // CONFIG_AOM_HIGHBITDEPTH
}
}
- unsigned int sse1, sse2;
- unsigned int var1;
- const int stride_coeff = 1;
- ASM_REGISTER_STATE_CHECK(var1 =
- variance_(src_, width_, ref_, width_, &sse1));
- const unsigned int var2 =
- variance_ref(src_, ref_, log2width_, log2height_, stride_coeff,
- stride_coeff, &sse2, use_high_bit_depth_, bit_depth_);
- EXPECT_EQ(sse1, sse2);
- EXPECT_EQ(var1, var2);
+ unsigned int sse1, sse2, var1, var2;
+ const int stride = width();
+ ASM_REGISTER_STATE_CHECK(
+ var1 = params_.func(src_, stride, ref_, stride, &sse1));
+ var2 =
+ variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
+ stride, &sse2, use_high_bit_depth(), params_.bit_depth);
+ EXPECT_EQ(sse1, sse2) << "Error at test index: " << i;
+ EXPECT_EQ(var1, var2) << "Error at test index: " << i;
}
}
template <typename VarianceFunctionType>
-void VarianceTest<VarianceFunctionType>::RefStrideTest() {
+void MainTestClass<VarianceFunctionType>::RefStrideTest() {
for (int i = 0; i < 10; ++i) {
- int ref_stride_coeff = i % 2;
- int src_stride_coeff = (i >> 1) % 2;
- for (int j = 0; j < block_size_; j++) {
- int ref_ind = (j / width_) * ref_stride_coeff * width_ + j % width_;
- int src_ind = (j / width_) * src_stride_coeff * width_ + j % width_;
- if (!use_high_bit_depth_) {
+ const int ref_stride = (i & 1) * width();
+ const int src_stride = ((i >> 1) & 1) * width();
+ for (int j = 0; j < block_size(); j++) {
+ const int ref_ind = (j / width()) * ref_stride + j % width();
+ const int src_ind = (j / width()) * src_stride + j % width();
+ if (!use_high_bit_depth()) {
src_[src_ind] = rnd_.Rand8();
ref_[ref_ind] = rnd_.Rand8();
#if CONFIG_AOM_HIGHBITDEPTH
} else {
- CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() && mask_;
- CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() && mask_;
+ CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() & mask();
+ CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() & mask();
#endif // CONFIG_AOM_HIGHBITDEPTH
}
}
unsigned int sse1, sse2;
- unsigned int var1;
+ unsigned int var1, var2;
- ASM_REGISTER_STATE_CHECK(var1 = variance_(src_, width_ * src_stride_coeff,
- ref_, width_ * ref_stride_coeff,
- &sse1));
- const unsigned int var2 =
- variance_ref(src_, ref_, log2width_, log2height_, src_stride_coeff,
- ref_stride_coeff, &sse2, use_high_bit_depth_, bit_depth_);
- EXPECT_EQ(sse1, sse2);
- EXPECT_EQ(var1, var2);
+ ASM_REGISTER_STATE_CHECK(
+ var1 = params_.func(src_, src_stride, ref_, ref_stride, &sse1));
+ var2 = variance_ref(src_, ref_, params_.log2width, params_.log2height,
+ src_stride, ref_stride, &sse2, use_high_bit_depth(),
+ params_.bit_depth);
+ EXPECT_EQ(sse1, sse2) << "Error at test index: " << i;
+ EXPECT_EQ(var1, var2) << "Error at test index: " << i;
}
}
template <typename VarianceFunctionType>
-void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
- const int half = block_size_ / 2;
- if (!use_high_bit_depth_) {
- memset(src_, 255, block_size_);
+void MainTestClass<VarianceFunctionType>::OneQuarterTest() {
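+ // src is all 255 and half of ref is 255, half 0, so for N = block_size()
+ // pixels: sse = (N/2)*255^2, se = (N/2)*255, and the variance
+ // sse - se*se/N comes out to exactly N*255*255/4.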
+ const int half = block_size() / 2;
+ if (!use_high_bit_depth()) {
+ memset(src_, 255, block_size());
memset(ref_, 255, half);
memset(ref_ + half, 0, half);
#if CONFIG_AOM_HIGHBITDEPTH
} else {
- aom_memset16(CONVERT_TO_SHORTPTR(src_), 255 << (bit_depth_ - 8),
- block_size_);
- aom_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << (bit_depth_ - 8), half);
+ aom_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size());
+ aom_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << byte_shift(), half);
aom_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half);
#endif // CONFIG_AOM_HIGHBITDEPTH
}
- unsigned int sse;
- unsigned int var;
- ASM_REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse));
- const unsigned int expected = block_size_ * 255 * 255 / 4;
+ unsigned int sse, var, expected;
+ ASM_REGISTER_STATE_CHECK(
+ var = params_.func(src_, width(), ref_, width(), &sse));
+ expected = block_size() * 255 * 255 / 4;
EXPECT_EQ(expected, var);
}
-template <typename MseFunctionType>
-class MseTest
- : public ::testing::TestWithParam<tuple<int, int, MseFunctionType> > {
- public:
- virtual void SetUp() {
- const tuple<int, int, MseFunctionType> ¶ms = this->GetParam();
- log2width_ = get<0>(params);
- width_ = 1 << log2width_;
- log2height_ = get<1>(params);
- height_ = 1 << log2height_;
- mse_ = get<2>(params);
+////////////////////////////////////////////////////////////////////////////////
+// Tests related to MSE / SSE.
- rnd(ACMRandom::DeterministicSeed());
- block_size_ = width_ * height_;
- src_ = reinterpret_cast<uint8_t *>(aom_memalign(16, block_size_));
- ref_ = new uint8_t[block_size_];
- ASSERT_TRUE(src_ != NULL);
- ASSERT_TRUE(ref_ != NULL);
- }
-
- virtual void TearDown() {
- aom_free(src_);
- delete[] ref_;
- libaom_test::ClearSystemState();
- }
-
- protected:
- void RefTest_mse();
- void RefTest_sse();
- void MaxTest_mse();
- void MaxTest_sse();
-
- ACMRandom rnd;
- uint8_t *src_;
- uint8_t *ref_;
- int width_, log2width_;
- int height_, log2height_;
- int block_size_;
- MseFunctionType mse_;
-};
-
-template <typename MseFunctionType>
-void MseTest<MseFunctionType>::RefTest_mse() {
+template <typename FunctionType>
+void MainTestClass<FunctionType>::RefTestMse() {
for (int i = 0; i < 10; ++i) {
- for (int j = 0; j < block_size_; j++) {
- src_[j] = rnd.Rand8();
- ref_[j] = rnd.Rand8();
+ for (int j = 0; j < block_size(); ++j) {
+ src_[j] = rnd_.Rand8();
+ ref_[j] = rnd_.Rand8();
}
unsigned int sse1, sse2;
- const int stride_coeff = 1;
- ASM_REGISTER_STATE_CHECK(mse_(src_, width_, ref_, width_, &sse1));
- variance_ref(src_, ref_, log2width_, log2height_, stride_coeff,
- stride_coeff, &sse2, false, AOM_BITS_8);
+ const int stride = width();
+ ASM_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse1));
+ variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
+ stride, &sse2, false, AOM_BITS_8);
EXPECT_EQ(sse1, sse2);
}
}
-template <typename MseFunctionType>
-void MseTest<MseFunctionType>::RefTest_sse() {
+template <typename FunctionType>
+void MainTestClass<FunctionType>::RefTestSse() {
for (int i = 0; i < 10; ++i) {
- for (int j = 0; j < block_size_; j++) {
- src_[j] = rnd.Rand8();
- ref_[j] = rnd.Rand8();
+ for (int j = 0; j < block_size(); ++j) {
+ src_[j] = rnd_.Rand8();
+ ref_[j] = rnd_.Rand8();
}
unsigned int sse2;
unsigned int var1;
- const int stride_coeff = 1;
- ASM_REGISTER_STATE_CHECK(var1 = mse_(src_, width_, ref_, width_));
- variance_ref(src_, ref_, log2width_, log2height_, stride_coeff,
- stride_coeff, &sse2, false, AOM_BITS_8);
+ const int stride = width();
+ ASM_REGISTER_STATE_CHECK(var1 = params_.func(src_, stride, ref_, stride));
+ variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
+ stride, &sse2, false, AOM_BITS_8);
EXPECT_EQ(var1, sse2);
}
}
-template <typename MseFunctionType>
-void MseTest<MseFunctionType>::MaxTest_mse() {
- memset(src_, 255, block_size_);
- memset(ref_, 0, block_size_);
+template <typename FunctionType>
+void MainTestClass<FunctionType>::MaxTestMse() {
+ memset(src_, 255, block_size());
+ memset(ref_, 0, block_size());
unsigned int sse;
- ASM_REGISTER_STATE_CHECK(mse_(src_, width_, ref_, width_, &sse));
- const unsigned int expected = block_size_ * 255 * 255;
+ ASM_REGISTER_STATE_CHECK(params_.func(src_, width(), ref_, width(), &sse));
+ const unsigned int expected = block_size() * 255 * 255;
EXPECT_EQ(expected, sse);
}
-template <typename MseFunctionType>
-void MseTest<MseFunctionType>::MaxTest_sse() {
- memset(src_, 255, block_size_);
- memset(ref_, 0, block_size_);
+template <typename FunctionType>
+void MainTestClass<FunctionType>::MaxTestSse() {
+ memset(src_, 255, block_size());
+ memset(ref_, 0, block_size());
unsigned int var;
- ASM_REGISTER_STATE_CHECK(var = mse_(src_, width_, ref_, width_));
- const unsigned int expected = block_size_ * 255 * 255;
+ ASM_REGISTER_STATE_CHECK(var = params_.func(src_, width(), ref_, width()));
+ const unsigned int expected = block_size() * 255 * 255;
EXPECT_EQ(expected, var);
}
-static uint32_t subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src,
- const uint8_t *second_pred, int l2w,
- int l2h, int xoff, int yoff,
- uint32_t *sse_ptr,
- bool use_high_bit_depth,
- aom_bit_depth_t bit_depth) {
- int64_t se = 0;
- uint64_t sse = 0;
- const int w = 1 << l2w;
- const int h = 1 << l2h;
+////////////////////////////////////////////////////////////////////////////////
- xoff <<= 1;
- yoff <<= 1;
-
- for (int y = 0; y < h; y++) {
- for (int x = 0; x < w; x++) {
- // bilinear interpolation at a 16th pel step
- if (!use_high_bit_depth) {
- const int a1 = ref[(w + 1) * (y + 0) + x + 0];
- const int a2 = ref[(w + 1) * (y + 0) + x + 1];
- const int b1 = ref[(w + 1) * (y + 1) + x + 0];
- const int b2 = ref[(w + 1) * (y + 1) + x + 1];
- const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
- const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
- const int r = a + (((b - a) * yoff + 8) >> 4);
- const int diff =
- ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
- se += diff;
- sse += diff * diff;
-#if CONFIG_AOM_HIGHBITDEPTH
- } else {
- uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
- uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
- uint16_t *sec16 = CONVERT_TO_SHORTPTR(second_pred);
- const int a1 = ref16[(w + 1) * (y + 0) + x + 0];
- const int a2 = ref16[(w + 1) * (y + 0) + x + 1];
- const int b1 = ref16[(w + 1) * (y + 1) + x + 0];
- const int b2 = ref16[(w + 1) * (y + 1) + x + 1];
- const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
- const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
- const int r = a + (((b - a) * yoff + 8) >> 4);
- const int diff = ((r + sec16[w * y + x] + 1) >> 1) - src16[w * y + x];
- se += diff;
- sse += diff * diff;
-#endif // CONFIG_AOM_HIGHBITDEPTH
- }
- }
- }
- RoundHighBitDepth(bit_depth, &se, &sse);
- *sse_ptr = static_cast<uint32_t>(sse);
- return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
-}
+using ::std::tr1::get;
+using ::std::tr1::make_tuple;
+using ::std::tr1::tuple;
template <typename SubpelVarianceFunctionType>
class SubpelVarianceTest
@@ -690,30 +697,30 @@
}
#endif // CONFIG_AOM_HIGHBITDEPTH
}
- unsigned int sse1, sse2;
- unsigned int var1;
+ uint32_t sse1, sse2;
+ uint32_t var1, var2;
ASM_REGISTER_STATE_CHECK(var1 =
subpel_variance_(ref_, width_ + 1, x, y,
src_, width_, &sse1, sec_));
- const unsigned int var2 =
- subpel_avg_variance_ref(ref_, src_, sec_, log2width_, log2height_, x,
- y, &sse2, use_high_bit_depth_, bit_depth_);
+ var2 = subpel_avg_variance_ref(ref_, src_, sec_, log2width_, log2height_,
+ x, y, &sse2, use_high_bit_depth_,
+ static_cast<aom_bit_depth_t>(bit_depth_));
EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
}
}
}
-typedef MseTest<Get4x4SseFunc> AvxSseTest;
-typedef MseTest<VarianceMxNFunc> AvxMseTest;
-typedef VarianceTest<VarianceMxNFunc> AvxVarianceTest;
+typedef MainTestClass<Get4x4SseFunc> AvxSseTest;
+typedef MainTestClass<VarianceMxNFunc> AvxMseTest;
+typedef MainTestClass<VarianceMxNFunc> AvxVarianceTest;
typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxSubpelVarianceTest;
typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxSubpelAvgVarianceTest;
-TEST_P(AvxSseTest, Ref_sse) { RefTest_sse(); }
-TEST_P(AvxSseTest, Max_sse) { MaxTest_sse(); }
-TEST_P(AvxMseTest, Ref_mse) { RefTest_mse(); }
-TEST_P(AvxMseTest, Max_mse) { MaxTest_mse(); }
+TEST_P(AvxSseTest, RefSse) { RefTestSse(); }
+TEST_P(AvxSseTest, MaxSse) { MaxTestSse(); }
+TEST_P(AvxMseTest, RefMse) { RefTestMse(); }
+TEST_P(AvxMseTest, MaxMse) { MaxTestMse(); }
TEST_P(AvxVarianceTest, Zero) { ZeroTest(); }
TEST_P(AvxVarianceTest, Ref) { RefTest(); }
TEST_P(AvxVarianceTest, RefStride) { RefStrideTest(); }
@@ -727,31 +734,34 @@
INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
::testing::Values(aom_get_mb_ss_c));
+typedef TestParams<Get4x4SseFunc> SseParams;
INSTANTIATE_TEST_CASE_P(C, AvxSseTest,
- ::testing::Values(make_tuple(2, 2,
- &aom_get4x4sse_cs_c)));
+ ::testing::Values(SseParams(2, 2,
+ &aom_get4x4sse_cs_c)));
+typedef TestParams<VarianceMxNFunc> MseParams;
INSTANTIATE_TEST_CASE_P(C, AvxMseTest,
- ::testing::Values(make_tuple(4, 4, &aom_mse16x16_c),
- make_tuple(4, 3, &aom_mse16x8_c),
- make_tuple(3, 4, &aom_mse8x16_c),
- make_tuple(3, 3, &aom_mse8x8_c)));
+ ::testing::Values(MseParams(4, 4, &aom_mse16x16_c),
+ MseParams(4, 3, &aom_mse16x8_c),
+ MseParams(3, 4, &aom_mse8x16_c),
+ MseParams(3, 3, &aom_mse8x8_c)));
+typedef TestParams<VarianceMxNFunc> VarianceParams;
INSTANTIATE_TEST_CASE_P(
C, AvxVarianceTest,
- ::testing::Values(make_tuple(6, 6, &aom_variance64x64_c, 0),
- make_tuple(6, 5, &aom_variance64x32_c, 0),
- make_tuple(5, 6, &aom_variance32x64_c, 0),
- make_tuple(5, 5, &aom_variance32x32_c, 0),
- make_tuple(5, 4, &aom_variance32x16_c, 0),
- make_tuple(4, 5, &aom_variance16x32_c, 0),
- make_tuple(4, 4, &aom_variance16x16_c, 0),
- make_tuple(4, 3, &aom_variance16x8_c, 0),
- make_tuple(3, 4, &aom_variance8x16_c, 0),
- make_tuple(3, 3, &aom_variance8x8_c, 0),
- make_tuple(3, 2, &aom_variance8x4_c, 0),
- make_tuple(2, 3, &aom_variance4x8_c, 0),
- make_tuple(2, 2, &aom_variance4x4_c, 0)));
+ ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_c),
+ VarianceParams(6, 5, &aom_variance64x32_c),
+ VarianceParams(5, 6, &aom_variance32x64_c),
+ VarianceParams(5, 5, &aom_variance32x32_c),
+ VarianceParams(5, 4, &aom_variance32x16_c),
+ VarianceParams(4, 5, &aom_variance16x32_c),
+ VarianceParams(4, 4, &aom_variance16x16_c),
+ VarianceParams(4, 3, &aom_variance16x8_c),
+ VarianceParams(3, 4, &aom_variance8x16_c),
+ VarianceParams(3, 3, &aom_variance8x8_c),
+ VarianceParams(3, 2, &aom_variance8x4_c),
+ VarianceParams(2, 3, &aom_variance4x8_c),
+ VarianceParams(2, 2, &aom_variance4x4_c)));
INSTANTIATE_TEST_CASE_P(
C, AvxSubpelVarianceTest,
@@ -786,13 +796,13 @@
make_tuple(2, 2, &aom_sub_pixel_avg_variance4x4_c, 0)));
#if CONFIG_AOM_HIGHBITDEPTH
-typedef MseTest<VarianceMxNFunc> AvxHBDMseTest;
-typedef VarianceTest<VarianceMxNFunc> AvxHBDVarianceTest;
+typedef MainTestClass<VarianceMxNFunc> AvxHBDMseTest;
+typedef MainTestClass<VarianceMxNFunc> AvxHBDVarianceTest;
typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxHBDSubpelVarianceTest;
typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxHBDSubpelAvgVarianceTest;
-TEST_P(AvxHBDMseTest, Ref_mse) { RefTest_mse(); }
-TEST_P(AvxHBDMseTest, Max_mse) { MaxTest_mse(); }
+TEST_P(AvxHBDMseTest, RefMse) { RefTestMse(); }
+TEST_P(AvxHBDMseTest, MaxMse) { MaxTestMse(); }
TEST_P(AvxHBDVarianceTest, Zero) { ZeroTest(); }
TEST_P(AvxHBDVarianceTest, Ref) { RefTest(); }
TEST_P(AvxHBDVarianceTest, RefStride) { RefStrideTest(); }
@@ -818,133 +828,191 @@
make_tuple(4, 4, &aom_highbd_8_mse8x8_c)));
*/
-INSTANTIATE_TEST_CASE_P(
- C, AvxHBDVarianceTest,
- ::testing::Values(make_tuple(6, 6, &aom_highbd_12_variance64x64_c, 12),
- make_tuple(6, 5, &aom_highbd_12_variance64x32_c, 12),
- make_tuple(5, 6, &aom_highbd_12_variance32x64_c, 12),
- make_tuple(5, 5, &aom_highbd_12_variance32x32_c, 12),
- make_tuple(5, 4, &aom_highbd_12_variance32x16_c, 12),
- make_tuple(4, 5, &aom_highbd_12_variance16x32_c, 12),
- make_tuple(4, 4, &aom_highbd_12_variance16x16_c, 12),
- make_tuple(4, 3, &aom_highbd_12_variance16x8_c, 12),
- make_tuple(3, 4, &aom_highbd_12_variance8x16_c, 12),
- make_tuple(3, 3, &aom_highbd_12_variance8x8_c, 12),
- make_tuple(3, 2, &aom_highbd_12_variance8x4_c, 12),
- make_tuple(2, 3, &aom_highbd_12_variance4x8_c, 12),
- make_tuple(2, 2, &aom_highbd_12_variance4x4_c, 12),
- make_tuple(6, 6, &aom_highbd_10_variance64x64_c, 10),
- make_tuple(6, 5, &aom_highbd_10_variance64x32_c, 10),
- make_tuple(5, 6, &aom_highbd_10_variance32x64_c, 10),
- make_tuple(5, 5, &aom_highbd_10_variance32x32_c, 10),
- make_tuple(5, 4, &aom_highbd_10_variance32x16_c, 10),
- make_tuple(4, 5, &aom_highbd_10_variance16x32_c, 10),
- make_tuple(4, 4, &aom_highbd_10_variance16x16_c, 10),
- make_tuple(4, 3, &aom_highbd_10_variance16x8_c, 10),
- make_tuple(3, 4, &aom_highbd_10_variance8x16_c, 10),
- make_tuple(3, 3, &aom_highbd_10_variance8x8_c, 10),
- make_tuple(3, 2, &aom_highbd_10_variance8x4_c, 10),
- make_tuple(2, 3, &aom_highbd_10_variance4x8_c, 10),
- make_tuple(2, 2, &aom_highbd_10_variance4x4_c, 10),
- make_tuple(6, 6, &aom_highbd_8_variance64x64_c, 8),
- make_tuple(6, 5, &aom_highbd_8_variance64x32_c, 8),
- make_tuple(5, 6, &aom_highbd_8_variance32x64_c, 8),
- make_tuple(5, 5, &aom_highbd_8_variance32x32_c, 8),
- make_tuple(5, 4, &aom_highbd_8_variance32x16_c, 8),
- make_tuple(4, 5, &aom_highbd_8_variance16x32_c, 8),
- make_tuple(4, 4, &aom_highbd_8_variance16x16_c, 8),
- make_tuple(4, 3, &aom_highbd_8_variance16x8_c, 8),
- make_tuple(3, 4, &aom_highbd_8_variance8x16_c, 8),
- make_tuple(3, 3, &aom_highbd_8_variance8x8_c, 8),
- make_tuple(3, 2, &aom_highbd_8_variance8x4_c, 8),
- make_tuple(2, 3, &aom_highbd_8_variance4x8_c, 8),
- make_tuple(2, 2, &aom_highbd_8_variance4x4_c, 8)));
+const VarianceParams kArrayHBDVariance_c[] = {
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ VarianceParams(7, 7, &aom_highbd_12_variance128x128_c, 12),
+ VarianceParams(7, 6, &aom_highbd_12_variance128x64_c, 12),
+ VarianceParams(6, 7, &aom_highbd_12_variance64x128_c, 12),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+ VarianceParams(6, 6, &aom_highbd_12_variance64x64_c, 12),
+ VarianceParams(6, 5, &aom_highbd_12_variance64x32_c, 12),
+ VarianceParams(5, 6, &aom_highbd_12_variance32x64_c, 12),
+ VarianceParams(5, 5, &aom_highbd_12_variance32x32_c, 12),
+ VarianceParams(5, 4, &aom_highbd_12_variance32x16_c, 12),
+ VarianceParams(4, 5, &aom_highbd_12_variance16x32_c, 12),
+ VarianceParams(4, 4, &aom_highbd_12_variance16x16_c, 12),
+ VarianceParams(4, 3, &aom_highbd_12_variance16x8_c, 12),
+ VarianceParams(3, 4, &aom_highbd_12_variance8x16_c, 12),
+ VarianceParams(3, 3, &aom_highbd_12_variance8x8_c, 12),
+ VarianceParams(3, 2, &aom_highbd_12_variance8x4_c, 12),
+ VarianceParams(2, 3, &aom_highbd_12_variance4x8_c, 12),
+ VarianceParams(2, 2, &aom_highbd_12_variance4x4_c, 12),
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ VarianceParams(7, 7, &aom_highbd_10_variance128x128_c, 10),
+ VarianceParams(7, 6, &aom_highbd_10_variance128x64_c, 10),
+ VarianceParams(6, 7, &aom_highbd_10_variance64x128_c, 10),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+ VarianceParams(6, 6, &aom_highbd_10_variance64x64_c, 10),
+ VarianceParams(6, 5, &aom_highbd_10_variance64x32_c, 10),
+ VarianceParams(5, 6, &aom_highbd_10_variance32x64_c, 10),
+ VarianceParams(5, 5, &aom_highbd_10_variance32x32_c, 10),
+ VarianceParams(5, 4, &aom_highbd_10_variance32x16_c, 10),
+ VarianceParams(4, 5, &aom_highbd_10_variance16x32_c, 10),
+ VarianceParams(4, 4, &aom_highbd_10_variance16x16_c, 10),
+ VarianceParams(4, 3, &aom_highbd_10_variance16x8_c, 10),
+ VarianceParams(3, 4, &aom_highbd_10_variance8x16_c, 10),
+ VarianceParams(3, 3, &aom_highbd_10_variance8x8_c, 10),
+ VarianceParams(3, 2, &aom_highbd_10_variance8x4_c, 10),
+ VarianceParams(2, 3, &aom_highbd_10_variance4x8_c, 10),
+ VarianceParams(2, 2, &aom_highbd_10_variance4x4_c, 10),
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ VarianceParams(7, 7, &aom_highbd_8_variance128x128_c, 8),
+ VarianceParams(7, 6, &aom_highbd_8_variance128x64_c, 8),
+ VarianceParams(6, 7, &aom_highbd_8_variance64x128_c, 8),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+ VarianceParams(6, 6, &aom_highbd_8_variance64x64_c, 8),
+ VarianceParams(6, 5, &aom_highbd_8_variance64x32_c, 8),
+ VarianceParams(5, 6, &aom_highbd_8_variance32x64_c, 8),
+ VarianceParams(5, 5, &aom_highbd_8_variance32x32_c, 8),
+ VarianceParams(5, 4, &aom_highbd_8_variance32x16_c, 8),
+ VarianceParams(4, 5, &aom_highbd_8_variance16x32_c, 8),
+ VarianceParams(4, 4, &aom_highbd_8_variance16x16_c, 8),
+ VarianceParams(4, 3, &aom_highbd_8_variance16x8_c, 8),
+ VarianceParams(3, 4, &aom_highbd_8_variance8x16_c, 8),
+ VarianceParams(3, 3, &aom_highbd_8_variance8x8_c, 8),
+ VarianceParams(3, 2, &aom_highbd_8_variance8x4_c, 8),
+ VarianceParams(2, 3, &aom_highbd_8_variance4x8_c, 8),
+ VarianceParams(2, 2, &aom_highbd_8_variance4x4_c, 8)
+};
+INSTANTIATE_TEST_CASE_P(C, AvxHBDVarianceTest,
+ ::testing::ValuesIn(kArrayHBDVariance_c));
+#if HAVE_SSE4_1 && CONFIG_AOM_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(
- C, AvxHBDSubpelVarianceTest,
+ SSE4_1, AvxHBDVarianceTest,
::testing::Values(
- make_tuple(6, 6, &aom_highbd_8_sub_pixel_variance64x64_c, 8),
- make_tuple(6, 5, &aom_highbd_8_sub_pixel_variance64x32_c, 8),
- make_tuple(5, 6, &aom_highbd_8_sub_pixel_variance32x64_c, 8),
- make_tuple(5, 5, &aom_highbd_8_sub_pixel_variance32x32_c, 8),
- make_tuple(5, 4, &aom_highbd_8_sub_pixel_variance32x16_c, 8),
- make_tuple(4, 5, &aom_highbd_8_sub_pixel_variance16x32_c, 8),
- make_tuple(4, 4, &aom_highbd_8_sub_pixel_variance16x16_c, 8),
- make_tuple(4, 3, &aom_highbd_8_sub_pixel_variance16x8_c, 8),
- make_tuple(3, 4, &aom_highbd_8_sub_pixel_variance8x16_c, 8),
- make_tuple(3, 3, &aom_highbd_8_sub_pixel_variance8x8_c, 8),
- make_tuple(3, 2, &aom_highbd_8_sub_pixel_variance8x4_c, 8),
- make_tuple(2, 3, &aom_highbd_8_sub_pixel_variance4x8_c, 8),
- make_tuple(2, 2, &aom_highbd_8_sub_pixel_variance4x4_c, 8),
- make_tuple(6, 6, &aom_highbd_10_sub_pixel_variance64x64_c, 10),
- make_tuple(6, 5, &aom_highbd_10_sub_pixel_variance64x32_c, 10),
- make_tuple(5, 6, &aom_highbd_10_sub_pixel_variance32x64_c, 10),
- make_tuple(5, 5, &aom_highbd_10_sub_pixel_variance32x32_c, 10),
- make_tuple(5, 4, &aom_highbd_10_sub_pixel_variance32x16_c, 10),
- make_tuple(4, 5, &aom_highbd_10_sub_pixel_variance16x32_c, 10),
- make_tuple(4, 4, &aom_highbd_10_sub_pixel_variance16x16_c, 10),
- make_tuple(4, 3, &aom_highbd_10_sub_pixel_variance16x8_c, 10),
- make_tuple(3, 4, &aom_highbd_10_sub_pixel_variance8x16_c, 10),
- make_tuple(3, 3, &aom_highbd_10_sub_pixel_variance8x8_c, 10),
- make_tuple(3, 2, &aom_highbd_10_sub_pixel_variance8x4_c, 10),
- make_tuple(2, 3, &aom_highbd_10_sub_pixel_variance4x8_c, 10),
- make_tuple(2, 2, &aom_highbd_10_sub_pixel_variance4x4_c, 10),
- make_tuple(6, 6, &aom_highbd_12_sub_pixel_variance64x64_c, 12),
- make_tuple(6, 5, &aom_highbd_12_sub_pixel_variance64x32_c, 12),
- make_tuple(5, 6, &aom_highbd_12_sub_pixel_variance32x64_c, 12),
- make_tuple(5, 5, &aom_highbd_12_sub_pixel_variance32x32_c, 12),
- make_tuple(5, 4, &aom_highbd_12_sub_pixel_variance32x16_c, 12),
- make_tuple(4, 5, &aom_highbd_12_sub_pixel_variance16x32_c, 12),
- make_tuple(4, 4, &aom_highbd_12_sub_pixel_variance16x16_c, 12),
- make_tuple(4, 3, &aom_highbd_12_sub_pixel_variance16x8_c, 12),
- make_tuple(3, 4, &aom_highbd_12_sub_pixel_variance8x16_c, 12),
- make_tuple(3, 3, &aom_highbd_12_sub_pixel_variance8x8_c, 12),
- make_tuple(3, 2, &aom_highbd_12_sub_pixel_variance8x4_c, 12),
- make_tuple(2, 3, &aom_highbd_12_sub_pixel_variance4x8_c, 12),
- make_tuple(2, 2, &aom_highbd_12_sub_pixel_variance4x4_c, 12)));
+ VarianceParams(2, 2, &aom_highbd_8_variance4x4_sse4_1, 8),
+ VarianceParams(2, 2, &aom_highbd_10_variance4x4_sse4_1, 10),
+ VarianceParams(2, 2, &aom_highbd_12_variance4x4_sse4_1, 12)));
+#endif // HAVE_SSE4_1 && CONFIG_AOM_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
- C, AvxHBDSubpelAvgVarianceTest,
- ::testing::Values(
- make_tuple(6, 6, &aom_highbd_8_sub_pixel_avg_variance64x64_c, 8),
- make_tuple(6, 5, &aom_highbd_8_sub_pixel_avg_variance64x32_c, 8),
- make_tuple(5, 6, &aom_highbd_8_sub_pixel_avg_variance32x64_c, 8),
- make_tuple(5, 5, &aom_highbd_8_sub_pixel_avg_variance32x32_c, 8),
- make_tuple(5, 4, &aom_highbd_8_sub_pixel_avg_variance32x16_c, 8),
- make_tuple(4, 5, &aom_highbd_8_sub_pixel_avg_variance16x32_c, 8),
- make_tuple(4, 4, &aom_highbd_8_sub_pixel_avg_variance16x16_c, 8),
- make_tuple(4, 3, &aom_highbd_8_sub_pixel_avg_variance16x8_c, 8),
- make_tuple(3, 4, &aom_highbd_8_sub_pixel_avg_variance8x16_c, 8),
- make_tuple(3, 3, &aom_highbd_8_sub_pixel_avg_variance8x8_c, 8),
- make_tuple(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_c, 8),
- make_tuple(2, 3, &aom_highbd_8_sub_pixel_avg_variance4x8_c, 8),
- make_tuple(2, 2, &aom_highbd_8_sub_pixel_avg_variance4x4_c, 8),
- make_tuple(6, 6, &aom_highbd_10_sub_pixel_avg_variance64x64_c, 10),
- make_tuple(6, 5, &aom_highbd_10_sub_pixel_avg_variance64x32_c, 10),
- make_tuple(5, 6, &aom_highbd_10_sub_pixel_avg_variance32x64_c, 10),
- make_tuple(5, 5, &aom_highbd_10_sub_pixel_avg_variance32x32_c, 10),
- make_tuple(5, 4, &aom_highbd_10_sub_pixel_avg_variance32x16_c, 10),
- make_tuple(4, 5, &aom_highbd_10_sub_pixel_avg_variance16x32_c, 10),
- make_tuple(4, 4, &aom_highbd_10_sub_pixel_avg_variance16x16_c, 10),
- make_tuple(4, 3, &aom_highbd_10_sub_pixel_avg_variance16x8_c, 10),
- make_tuple(3, 4, &aom_highbd_10_sub_pixel_avg_variance8x16_c, 10),
- make_tuple(3, 3, &aom_highbd_10_sub_pixel_avg_variance8x8_c, 10),
- make_tuple(3, 2, &aom_highbd_10_sub_pixel_avg_variance8x4_c, 10),
- make_tuple(2, 3, &aom_highbd_10_sub_pixel_avg_variance4x8_c, 10),
- make_tuple(2, 2, &aom_highbd_10_sub_pixel_avg_variance4x4_c, 10),
- make_tuple(6, 6, &aom_highbd_12_sub_pixel_avg_variance64x64_c, 12),
- make_tuple(6, 5, &aom_highbd_12_sub_pixel_avg_variance64x32_c, 12),
- make_tuple(5, 6, &aom_highbd_12_sub_pixel_avg_variance32x64_c, 12),
- make_tuple(5, 5, &aom_highbd_12_sub_pixel_avg_variance32x32_c, 12),
- make_tuple(5, 4, &aom_highbd_12_sub_pixel_avg_variance32x16_c, 12),
- make_tuple(4, 5, &aom_highbd_12_sub_pixel_avg_variance16x32_c, 12),
- make_tuple(4, 4, &aom_highbd_12_sub_pixel_avg_variance16x16_c, 12),
- make_tuple(4, 3, &aom_highbd_12_sub_pixel_avg_variance16x8_c, 12),
- make_tuple(3, 4, &aom_highbd_12_sub_pixel_avg_variance8x16_c, 12),
- make_tuple(3, 3, &aom_highbd_12_sub_pixel_avg_variance8x8_c, 12),
- make_tuple(3, 2, &aom_highbd_12_sub_pixel_avg_variance8x4_c, 12),
- make_tuple(2, 3, &aom_highbd_12_sub_pixel_avg_variance4x8_c, 12),
- make_tuple(2, 2, &aom_highbd_12_sub_pixel_avg_variance4x4_c, 12)));
+const AvxHBDSubpelVarianceTest::ParamType kArrayHBDSubpelVariance_c[] = {
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(7, 7, &aom_highbd_8_sub_pixel_variance128x128_c, 8),
+ make_tuple(7, 6, &aom_highbd_8_sub_pixel_variance128x64_c, 8),
+ make_tuple(6, 7, &aom_highbd_8_sub_pixel_variance64x128_c, 8),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(6, 6, &aom_highbd_8_sub_pixel_variance64x64_c, 8),
+ make_tuple(6, 5, &aom_highbd_8_sub_pixel_variance64x32_c, 8),
+ make_tuple(5, 6, &aom_highbd_8_sub_pixel_variance32x64_c, 8),
+ make_tuple(5, 5, &aom_highbd_8_sub_pixel_variance32x32_c, 8),
+ make_tuple(5, 4, &aom_highbd_8_sub_pixel_variance32x16_c, 8),
+ make_tuple(4, 5, &aom_highbd_8_sub_pixel_variance16x32_c, 8),
+ make_tuple(4, 4, &aom_highbd_8_sub_pixel_variance16x16_c, 8),
+ make_tuple(4, 3, &aom_highbd_8_sub_pixel_variance16x8_c, 8),
+ make_tuple(3, 4, &aom_highbd_8_sub_pixel_variance8x16_c, 8),
+ make_tuple(3, 3, &aom_highbd_8_sub_pixel_variance8x8_c, 8),
+ make_tuple(3, 2, &aom_highbd_8_sub_pixel_variance8x4_c, 8),
+ make_tuple(2, 3, &aom_highbd_8_sub_pixel_variance4x8_c, 8),
+ make_tuple(2, 2, &aom_highbd_8_sub_pixel_variance4x4_c, 8),
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(7, 7, &aom_highbd_10_sub_pixel_variance128x128_c, 10),
+ make_tuple(7, 6, &aom_highbd_10_sub_pixel_variance128x64_c, 10),
+ make_tuple(6, 7, &aom_highbd_10_sub_pixel_variance64x128_c, 10),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(6, 6, &aom_highbd_10_sub_pixel_variance64x64_c, 10),
+ make_tuple(6, 5, &aom_highbd_10_sub_pixel_variance64x32_c, 10),
+ make_tuple(5, 6, &aom_highbd_10_sub_pixel_variance32x64_c, 10),
+ make_tuple(5, 5, &aom_highbd_10_sub_pixel_variance32x32_c, 10),
+ make_tuple(5, 4, &aom_highbd_10_sub_pixel_variance32x16_c, 10),
+ make_tuple(4, 5, &aom_highbd_10_sub_pixel_variance16x32_c, 10),
+ make_tuple(4, 4, &aom_highbd_10_sub_pixel_variance16x16_c, 10),
+ make_tuple(4, 3, &aom_highbd_10_sub_pixel_variance16x8_c, 10),
+ make_tuple(3, 4, &aom_highbd_10_sub_pixel_variance8x16_c, 10),
+ make_tuple(3, 3, &aom_highbd_10_sub_pixel_variance8x8_c, 10),
+ make_tuple(3, 2, &aom_highbd_10_sub_pixel_variance8x4_c, 10),
+ make_tuple(2, 3, &aom_highbd_10_sub_pixel_variance4x8_c, 10),
+ make_tuple(2, 2, &aom_highbd_10_sub_pixel_variance4x4_c, 10),
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(7, 7, &aom_highbd_12_sub_pixel_variance128x128_c, 12),
+ make_tuple(7, 6, &aom_highbd_12_sub_pixel_variance128x64_c, 12),
+ make_tuple(6, 7, &aom_highbd_12_sub_pixel_variance64x128_c, 12),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(6, 6, &aom_highbd_12_sub_pixel_variance64x64_c, 12),
+ make_tuple(6, 5, &aom_highbd_12_sub_pixel_variance64x32_c, 12),
+ make_tuple(5, 6, &aom_highbd_12_sub_pixel_variance32x64_c, 12),
+ make_tuple(5, 5, &aom_highbd_12_sub_pixel_variance32x32_c, 12),
+ make_tuple(5, 4, &aom_highbd_12_sub_pixel_variance32x16_c, 12),
+ make_tuple(4, 5, &aom_highbd_12_sub_pixel_variance16x32_c, 12),
+ make_tuple(4, 4, &aom_highbd_12_sub_pixel_variance16x16_c, 12),
+ make_tuple(4, 3, &aom_highbd_12_sub_pixel_variance16x8_c, 12),
+ make_tuple(3, 4, &aom_highbd_12_sub_pixel_variance8x16_c, 12),
+ make_tuple(3, 3, &aom_highbd_12_sub_pixel_variance8x8_c, 12),
+ make_tuple(3, 2, &aom_highbd_12_sub_pixel_variance8x4_c, 12),
+ make_tuple(2, 3, &aom_highbd_12_sub_pixel_variance4x8_c, 12),
+ make_tuple(2, 2, &aom_highbd_12_sub_pixel_variance4x4_c, 12),
+};
+INSTANTIATE_TEST_CASE_P(C, AvxHBDSubpelVarianceTest,
+ ::testing::ValuesIn(kArrayHBDSubpelVariance_c));
+
+const AvxHBDSubpelAvgVarianceTest::ParamType kArrayHBDSubpelAvgVariance_c[] = {
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(7, 7, &aom_highbd_8_sub_pixel_avg_variance128x128_c, 8),
+ make_tuple(7, 6, &aom_highbd_8_sub_pixel_avg_variance128x64_c, 8),
+ make_tuple(6, 7, &aom_highbd_8_sub_pixel_avg_variance64x128_c, 8),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(6, 6, &aom_highbd_8_sub_pixel_avg_variance64x64_c, 8),
+ make_tuple(6, 5, &aom_highbd_8_sub_pixel_avg_variance64x32_c, 8),
+ make_tuple(5, 6, &aom_highbd_8_sub_pixel_avg_variance32x64_c, 8),
+ make_tuple(5, 5, &aom_highbd_8_sub_pixel_avg_variance32x32_c, 8),
+ make_tuple(5, 4, &aom_highbd_8_sub_pixel_avg_variance32x16_c, 8),
+ make_tuple(4, 5, &aom_highbd_8_sub_pixel_avg_variance16x32_c, 8),
+ make_tuple(4, 4, &aom_highbd_8_sub_pixel_avg_variance16x16_c, 8),
+ make_tuple(4, 3, &aom_highbd_8_sub_pixel_avg_variance16x8_c, 8),
+ make_tuple(3, 4, &aom_highbd_8_sub_pixel_avg_variance8x16_c, 8),
+ make_tuple(3, 3, &aom_highbd_8_sub_pixel_avg_variance8x8_c, 8),
+ make_tuple(3, 2, &aom_highbd_8_sub_pixel_avg_variance8x4_c, 8),
+ make_tuple(2, 3, &aom_highbd_8_sub_pixel_avg_variance4x8_c, 8),
+ make_tuple(2, 2, &aom_highbd_8_sub_pixel_avg_variance4x4_c, 8),
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(7, 7, &aom_highbd_10_sub_pixel_avg_variance128x128_c, 10),
+ make_tuple(7, 6, &aom_highbd_10_sub_pixel_avg_variance128x64_c, 10),
+ make_tuple(6, 7, &aom_highbd_10_sub_pixel_avg_variance64x128_c, 10),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(6, 6, &aom_highbd_10_sub_pixel_avg_variance64x64_c, 10),
+ make_tuple(6, 5, &aom_highbd_10_sub_pixel_avg_variance64x32_c, 10),
+ make_tuple(5, 6, &aom_highbd_10_sub_pixel_avg_variance32x64_c, 10),
+ make_tuple(5, 5, &aom_highbd_10_sub_pixel_avg_variance32x32_c, 10),
+ make_tuple(5, 4, &aom_highbd_10_sub_pixel_avg_variance32x16_c, 10),
+ make_tuple(4, 5, &aom_highbd_10_sub_pixel_avg_variance16x32_c, 10),
+ make_tuple(4, 4, &aom_highbd_10_sub_pixel_avg_variance16x16_c, 10),
+ make_tuple(4, 3, &aom_highbd_10_sub_pixel_avg_variance16x8_c, 10),
+ make_tuple(3, 4, &aom_highbd_10_sub_pixel_avg_variance8x16_c, 10),
+ make_tuple(3, 3, &aom_highbd_10_sub_pixel_avg_variance8x8_c, 10),
+ make_tuple(3, 2, &aom_highbd_10_sub_pixel_avg_variance8x4_c, 10),
+ make_tuple(2, 3, &aom_highbd_10_sub_pixel_avg_variance4x8_c, 10),
+ make_tuple(2, 2, &aom_highbd_10_sub_pixel_avg_variance4x4_c, 10),
+#if CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(7, 7, &aom_highbd_12_sub_pixel_avg_variance128x128_c, 12),
+ make_tuple(7, 6, &aom_highbd_12_sub_pixel_avg_variance128x64_c, 12),
+ make_tuple(6, 7, &aom_highbd_12_sub_pixel_avg_variance64x128_c, 12),
+#endif // CONFIG_AV1 && CONFIG_EXT_PARTITION
+ make_tuple(6, 6, &aom_highbd_12_sub_pixel_avg_variance64x64_c, 12),
+ make_tuple(6, 5, &aom_highbd_12_sub_pixel_avg_variance64x32_c, 12),
+ make_tuple(5, 6, &aom_highbd_12_sub_pixel_avg_variance32x64_c, 12),
+ make_tuple(5, 5, &aom_highbd_12_sub_pixel_avg_variance32x32_c, 12),
+ make_tuple(5, 4, &aom_highbd_12_sub_pixel_avg_variance32x16_c, 12),
+ make_tuple(4, 5, &aom_highbd_12_sub_pixel_avg_variance16x32_c, 12),
+ make_tuple(4, 4, &aom_highbd_12_sub_pixel_avg_variance16x16_c, 12),
+ make_tuple(4, 3, &aom_highbd_12_sub_pixel_avg_variance16x8_c, 12),
+ make_tuple(3, 4, &aom_highbd_12_sub_pixel_avg_variance8x16_c, 12),
+ make_tuple(3, 3, &aom_highbd_12_sub_pixel_avg_variance8x8_c, 12),
+ make_tuple(3, 2, &aom_highbd_12_sub_pixel_avg_variance8x4_c, 12),
+ make_tuple(2, 3, &aom_highbd_12_sub_pixel_avg_variance4x8_c, 12),
+ make_tuple(2, 2, &aom_highbd_12_sub_pixel_avg_variance4x4_c, 12)
+};
+INSTANTIATE_TEST_CASE_P(C, AvxHBDSubpelAvgVarianceTest,
+ ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_c));
#endif // CONFIG_AOM_HIGHBITDEPTH
#if HAVE_SSE2
@@ -952,26 +1020,26 @@
::testing::Values(aom_get_mb_ss_sse2));
INSTANTIATE_TEST_CASE_P(SSE2, AvxMseTest,
- ::testing::Values(make_tuple(4, 4, &aom_mse16x16_sse2),
- make_tuple(4, 3, &aom_mse16x8_sse2),
- make_tuple(3, 4, &aom_mse8x16_sse2),
- make_tuple(3, 3, &aom_mse8x8_sse2)));
+ ::testing::Values(MseParams(4, 4, &aom_mse16x16_sse2),
+ MseParams(4, 3, &aom_mse16x8_sse2),
+ MseParams(3, 4, &aom_mse8x16_sse2),
+ MseParams(3, 3, &aom_mse8x8_sse2)));
INSTANTIATE_TEST_CASE_P(
SSE2, AvxVarianceTest,
- ::testing::Values(make_tuple(6, 6, &aom_variance64x64_sse2, 0),
- make_tuple(6, 5, &aom_variance64x32_sse2, 0),
- make_tuple(5, 6, &aom_variance32x64_sse2, 0),
- make_tuple(5, 5, &aom_variance32x32_sse2, 0),
- make_tuple(5, 4, &aom_variance32x16_sse2, 0),
- make_tuple(4, 5, &aom_variance16x32_sse2, 0),
- make_tuple(4, 4, &aom_variance16x16_sse2, 0),
- make_tuple(4, 3, &aom_variance16x8_sse2, 0),
- make_tuple(3, 4, &aom_variance8x16_sse2, 0),
- make_tuple(3, 3, &aom_variance8x8_sse2, 0),
- make_tuple(3, 2, &aom_variance8x4_sse2, 0),
- make_tuple(2, 3, &aom_variance4x8_sse2, 0),
- make_tuple(2, 2, &aom_variance4x4_sse2, 0)));
+ ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_sse2),
+ VarianceParams(6, 5, &aom_variance64x32_sse2),
+ VarianceParams(5, 6, &aom_variance32x64_sse2),
+ VarianceParams(5, 5, &aom_variance32x32_sse2),
+ VarianceParams(5, 4, &aom_variance32x16_sse2),
+ VarianceParams(4, 5, &aom_variance16x32_sse2),
+ VarianceParams(4, 4, &aom_variance16x16_sse2),
+ VarianceParams(4, 3, &aom_variance16x8_sse2),
+ VarianceParams(3, 4, &aom_variance8x16_sse2),
+ VarianceParams(3, 3, &aom_variance8x8_sse2),
+ VarianceParams(3, 2, &aom_variance8x4_sse2),
+ VarianceParams(2, 3, &aom_variance4x8_sse2),
+ VarianceParams(2, 2, &aom_variance4x4_sse2)));
INSTANTIATE_TEST_CASE_P(
SSE2, AvxSubpelVarianceTest,
@@ -1006,56 +1074,73 @@
make_tuple(2, 3, &aom_sub_pixel_avg_variance4x8_sse2, 0),
make_tuple(2, 2, &aom_sub_pixel_avg_variance4x4_sse2, 0)));
+#if HAVE_SSE4_1 && CONFIG_AOM_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, AvxSubpelVarianceTest,
+ ::testing::Values(
+ make_tuple(2, 2, &aom_highbd_8_sub_pixel_variance4x4_sse4_1, 8),
+ make_tuple(2, 2, &aom_highbd_10_sub_pixel_variance4x4_sse4_1, 10),
+ make_tuple(2, 2, &aom_highbd_12_sub_pixel_variance4x4_sse4_1, 12)));
+
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, AvxSubpelAvgVarianceTest,
+ ::testing::Values(
+ make_tuple(2, 2, &aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1, 8),
+ make_tuple(2, 2, &aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1, 10),
+ make_tuple(2, 2, &aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1, 12)));
+#endif // HAVE_SSE4_1 && CONFIG_AOM_HIGHBITDEPTH
+
#if CONFIG_AOM_HIGHBITDEPTH
/* TODO(debargha): This test does not support the highbd version
INSTANTIATE_TEST_CASE_P(
SSE2, AvxHBDMseTest,
- ::testing::Values(make_tuple(4, 4, &aom_highbd_12_mse16x16_sse2),
- make_tuple(4, 3, &aom_highbd_12_mse16x8_sse2),
- make_tuple(3, 4, &aom_highbd_12_mse8x16_sse2),
- make_tuple(3, 3, &aom_highbd_12_mse8x8_sse2),
- make_tuple(4, 4, &aom_highbd_10_mse16x16_sse2),
- make_tuple(4, 3, &aom_highbd_10_mse16x8_sse2),
- make_tuple(3, 4, &aom_highbd_10_mse8x16_sse2),
- make_tuple(3, 3, &aom_highbd_10_mse8x8_sse2),
- make_tuple(4, 4, &aom_highbd_8_mse16x16_sse2),
- make_tuple(4, 3, &aom_highbd_8_mse16x8_sse2),
- make_tuple(3, 4, &aom_highbd_8_mse8x16_sse2),
- make_tuple(3, 3, &aom_highbd_8_mse8x8_sse2)));
+ ::testing::Values(MseParams(4, 4, &aom_highbd_12_mse16x16_sse2),
+ MseParams(4, 3, &aom_highbd_12_mse16x8_sse2),
+ MseParams(3, 4, &aom_highbd_12_mse8x16_sse2),
+ MseParams(3, 3, &aom_highbd_12_mse8x8_sse2),
+ MseParams(4, 4, &aom_highbd_10_mse16x16_sse2),
+ MseParams(4, 3, &aom_highbd_10_mse16x8_sse2),
+ MseParams(3, 4, &aom_highbd_10_mse8x16_sse2),
+ MseParams(3, 3, &aom_highbd_10_mse8x8_sse2),
+ MseParams(4, 4, &aom_highbd_8_mse16x16_sse2),
+ MseParams(4, 3, &aom_highbd_8_mse16x8_sse2),
+ MseParams(3, 4, &aom_highbd_8_mse8x16_sse2),
+ MseParams(3, 3, &aom_highbd_8_mse8x8_sse2)));
*/
INSTANTIATE_TEST_CASE_P(
SSE2, AvxHBDVarianceTest,
- ::testing::Values(make_tuple(6, 6, &aom_highbd_12_variance64x64_sse2, 12),
- make_tuple(6, 5, &aom_highbd_12_variance64x32_sse2, 12),
- make_tuple(5, 6, &aom_highbd_12_variance32x64_sse2, 12),
- make_tuple(5, 5, &aom_highbd_12_variance32x32_sse2, 12),
- make_tuple(5, 4, &aom_highbd_12_variance32x16_sse2, 12),
- make_tuple(4, 5, &aom_highbd_12_variance16x32_sse2, 12),
- make_tuple(4, 4, &aom_highbd_12_variance16x16_sse2, 12),
- make_tuple(4, 3, &aom_highbd_12_variance16x8_sse2, 12),
- make_tuple(3, 4, &aom_highbd_12_variance8x16_sse2, 12),
- make_tuple(3, 3, &aom_highbd_12_variance8x8_sse2, 12),
- make_tuple(6, 6, &aom_highbd_10_variance64x64_sse2, 10),
- make_tuple(6, 5, &aom_highbd_10_variance64x32_sse2, 10),
- make_tuple(5, 6, &aom_highbd_10_variance32x64_sse2, 10),
- make_tuple(5, 5, &aom_highbd_10_variance32x32_sse2, 10),
- make_tuple(5, 4, &aom_highbd_10_variance32x16_sse2, 10),
- make_tuple(4, 5, &aom_highbd_10_variance16x32_sse2, 10),
- make_tuple(4, 4, &aom_highbd_10_variance16x16_sse2, 10),
- make_tuple(4, 3, &aom_highbd_10_variance16x8_sse2, 10),
- make_tuple(3, 4, &aom_highbd_10_variance8x16_sse2, 10),
- make_tuple(3, 3, &aom_highbd_10_variance8x8_sse2, 10),
- make_tuple(6, 6, &aom_highbd_8_variance64x64_sse2, 8),
- make_tuple(6, 5, &aom_highbd_8_variance64x32_sse2, 8),
- make_tuple(5, 6, &aom_highbd_8_variance32x64_sse2, 8),
- make_tuple(5, 5, &aom_highbd_8_variance32x32_sse2, 8),
- make_tuple(5, 4, &aom_highbd_8_variance32x16_sse2, 8),
- make_tuple(4, 5, &aom_highbd_8_variance16x32_sse2, 8),
- make_tuple(4, 4, &aom_highbd_8_variance16x16_sse2, 8),
- make_tuple(4, 3, &aom_highbd_8_variance16x8_sse2, 8),
- make_tuple(3, 4, &aom_highbd_8_variance8x16_sse2, 8),
- make_tuple(3, 3, &aom_highbd_8_variance8x8_sse2, 8)));
+ ::testing::Values(
+ VarianceParams(6, 6, &aom_highbd_12_variance64x64_sse2, 12),
+ VarianceParams(6, 5, &aom_highbd_12_variance64x32_sse2, 12),
+ VarianceParams(5, 6, &aom_highbd_12_variance32x64_sse2, 12),
+ VarianceParams(5, 5, &aom_highbd_12_variance32x32_sse2, 12),
+ VarianceParams(5, 4, &aom_highbd_12_variance32x16_sse2, 12),
+ VarianceParams(4, 5, &aom_highbd_12_variance16x32_sse2, 12),
+ VarianceParams(4, 4, &aom_highbd_12_variance16x16_sse2, 12),
+ VarianceParams(4, 3, &aom_highbd_12_variance16x8_sse2, 12),
+ VarianceParams(3, 4, &aom_highbd_12_variance8x16_sse2, 12),
+ VarianceParams(3, 3, &aom_highbd_12_variance8x8_sse2, 12),
+ VarianceParams(6, 6, &aom_highbd_10_variance64x64_sse2, 10),
+ VarianceParams(6, 5, &aom_highbd_10_variance64x32_sse2, 10),
+ VarianceParams(5, 6, &aom_highbd_10_variance32x64_sse2, 10),
+ VarianceParams(5, 5, &aom_highbd_10_variance32x32_sse2, 10),
+ VarianceParams(5, 4, &aom_highbd_10_variance32x16_sse2, 10),
+ VarianceParams(4, 5, &aom_highbd_10_variance16x32_sse2, 10),
+ VarianceParams(4, 4, &aom_highbd_10_variance16x16_sse2, 10),
+ VarianceParams(4, 3, &aom_highbd_10_variance16x8_sse2, 10),
+ VarianceParams(3, 4, &aom_highbd_10_variance8x16_sse2, 10),
+ VarianceParams(3, 3, &aom_highbd_10_variance8x8_sse2, 10),
+ VarianceParams(6, 6, &aom_highbd_8_variance64x64_sse2, 8),
+ VarianceParams(6, 5, &aom_highbd_8_variance64x32_sse2, 8),
+ VarianceParams(5, 6, &aom_highbd_8_variance32x64_sse2, 8),
+ VarianceParams(5, 5, &aom_highbd_8_variance32x32_sse2, 8),
+ VarianceParams(5, 4, &aom_highbd_8_variance32x16_sse2, 8),
+ VarianceParams(4, 5, &aom_highbd_8_variance16x32_sse2, 8),
+ VarianceParams(4, 4, &aom_highbd_8_variance16x16_sse2, 8),
+ VarianceParams(4, 3, &aom_highbd_8_variance16x8_sse2, 8),
+ VarianceParams(3, 4, &aom_highbd_8_variance8x16_sse2, 8),
+ VarianceParams(3, 3, &aom_highbd_8_variance8x8_sse2, 8)));
INSTANTIATE_TEST_CASE_P(
SSE2, AvxHBDSubpelVarianceTest,
@@ -1170,16 +1255,15 @@
#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(AVX2, AvxMseTest,
- ::testing::Values(make_tuple(4, 4,
- &aom_mse16x16_avx2)));
+ ::testing::Values(MseParams(4, 4, &aom_mse16x16_avx2)));
INSTANTIATE_TEST_CASE_P(
AVX2, AvxVarianceTest,
- ::testing::Values(make_tuple(6, 6, &aom_variance64x64_avx2, 0),
- make_tuple(6, 5, &aom_variance64x32_avx2, 0),
- make_tuple(5, 5, &aom_variance32x32_avx2, 0),
- make_tuple(5, 4, &aom_variance32x16_avx2, 0),
- make_tuple(4, 4, &aom_variance16x16_avx2, 0)));
+ ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_avx2),
+ VarianceParams(6, 5, &aom_variance64x32_avx2),
+ VarianceParams(5, 5, &aom_variance32x32_avx2),
+ VarianceParams(5, 4, &aom_variance32x16_avx2),
+ VarianceParams(4, 4, &aom_variance16x16_avx2)));
INSTANTIATE_TEST_CASE_P(
AVX2, AvxSubpelVarianceTest,
@@ -1195,13 +1279,13 @@
#if HAVE_MEDIA
INSTANTIATE_TEST_CASE_P(MEDIA, AvxMseTest,
- ::testing::Values(make_tuple(4, 4,
- &aom_mse16x16_media)));
+ ::testing::Values(MseParams(4, 4,
+ &aom_mse16x16_media)));
INSTANTIATE_TEST_CASE_P(
MEDIA, AvxVarianceTest,
- ::testing::Values(make_tuple(4, 4, &aom_variance16x16_media, 0),
- make_tuple(3, 3, &aom_variance8x8_media, 0)));
+ ::testing::Values(VarianceParams(4, 4, &aom_variance16x16_media),
+ VarianceParams(3, 3, &aom_variance8x8_media)));
INSTANTIATE_TEST_CASE_P(
MEDIA, AvxSubpelVarianceTest,
@@ -1211,23 +1295,22 @@
#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(NEON, AvxSseTest,
- ::testing::Values(make_tuple(2, 2,
- &aom_get4x4sse_cs_neon)));
+ ::testing::Values(SseParams(2, 2,
+ &aom_get4x4sse_cs_neon)));
INSTANTIATE_TEST_CASE_P(NEON, AvxMseTest,
- ::testing::Values(make_tuple(4, 4,
- &aom_mse16x16_neon)));
+ ::testing::Values(MseParams(4, 4, &aom_mse16x16_neon)));
INSTANTIATE_TEST_CASE_P(
NEON, AvxVarianceTest,
- ::testing::Values(make_tuple(6, 6, &aom_variance64x64_neon, 0),
- make_tuple(6, 5, &aom_variance64x32_neon, 0),
- make_tuple(5, 6, &aom_variance32x64_neon, 0),
- make_tuple(5, 5, &aom_variance32x32_neon, 0),
- make_tuple(4, 4, &aom_variance16x16_neon, 0),
- make_tuple(4, 3, &aom_variance16x8_neon, 0),
- make_tuple(3, 4, &aom_variance8x16_neon, 0),
- make_tuple(3, 3, &aom_variance8x8_neon, 0)));
+ ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_neon),
+ VarianceParams(6, 5, &aom_variance64x32_neon),
+ VarianceParams(5, 6, &aom_variance32x64_neon),
+ VarianceParams(5, 5, &aom_variance32x32_neon),
+ VarianceParams(4, 4, &aom_variance16x16_neon),
+ VarianceParams(4, 3, &aom_variance16x8_neon),
+ VarianceParams(3, 4, &aom_variance8x16_neon),
+ VarianceParams(3, 3, &aom_variance8x8_neon)));
INSTANTIATE_TEST_CASE_P(
NEON, AvxSubpelVarianceTest,
@@ -1242,30 +1325,30 @@
::testing::Values(aom_get_mb_ss_msa));
INSTANTIATE_TEST_CASE_P(MSA, AvxSseTest,
- ::testing::Values(make_tuple(2, 2,
- &aom_get4x4sse_cs_msa)));
+ ::testing::Values(SseParams(2, 2,
+ &aom_get4x4sse_cs_msa)));
INSTANTIATE_TEST_CASE_P(MSA, AvxMseTest,
- ::testing::Values(make_tuple(4, 4, &aom_mse16x16_msa),
- make_tuple(4, 3, &aom_mse16x8_msa),
- make_tuple(3, 4, &aom_mse8x16_msa),
- make_tuple(3, 3, &aom_mse8x8_msa)));
+ ::testing::Values(MseParams(4, 4, &aom_mse16x16_msa),
+ MseParams(4, 3, &aom_mse16x8_msa),
+ MseParams(3, 4, &aom_mse8x16_msa),
+ MseParams(3, 3, &aom_mse8x8_msa)));
INSTANTIATE_TEST_CASE_P(
MSA, AvxVarianceTest,
- ::testing::Values(make_tuple(6, 6, &aom_variance64x64_msa, 0),
- make_tuple(6, 5, &aom_variance64x32_msa, 0),
- make_tuple(5, 6, &aom_variance32x64_msa, 0),
- make_tuple(5, 5, &aom_variance32x32_msa, 0),
- make_tuple(5, 4, &aom_variance32x16_msa, 0),
- make_tuple(4, 5, &aom_variance16x32_msa, 0),
- make_tuple(4, 4, &aom_variance16x16_msa, 0),
- make_tuple(4, 3, &aom_variance16x8_msa, 0),
- make_tuple(3, 4, &aom_variance8x16_msa, 0),
- make_tuple(3, 3, &aom_variance8x8_msa, 0),
- make_tuple(3, 2, &aom_variance8x4_msa, 0),
- make_tuple(2, 3, &aom_variance4x8_msa, 0),
- make_tuple(2, 2, &aom_variance4x4_msa, 0)));
+ ::testing::Values(VarianceParams(6, 6, &aom_variance64x64_msa),
+ VarianceParams(6, 5, &aom_variance64x32_msa),
+ VarianceParams(5, 6, &aom_variance32x64_msa),
+ VarianceParams(5, 5, &aom_variance32x32_msa),
+ VarianceParams(5, 4, &aom_variance32x16_msa),
+ VarianceParams(4, 5, &aom_variance16x32_msa),
+ VarianceParams(4, 4, &aom_variance16x16_msa),
+ VarianceParams(4, 3, &aom_variance16x8_msa),
+ VarianceParams(3, 4, &aom_variance8x16_msa),
+ VarianceParams(3, 3, &aom_variance8x8_msa),
+ VarianceParams(3, 2, &aom_variance8x4_msa),
+ VarianceParams(2, 3, &aom_variance4x8_msa),
+ VarianceParams(2, 2, &aom_variance4x4_msa)));
INSTANTIATE_TEST_CASE_P(
MSA, AvxSubpelVarianceTest,
diff --git a/third_party/fastfeat/LICENSE b/third_party/fastfeat/LICENSE
new file mode 100644
index 0000000..f347008
--- /dev/null
+++ b/third_party/fastfeat/LICENSE
@@ -0,0 +1,30 @@
+Copyright (c) 2006, 2008 Edward Rosten
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+
+ *Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ *Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ *Neither the name of the University of Cambridge nor the names of
+ its contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/fastfeat/README.libvpx b/third_party/fastfeat/README.libvpx
new file mode 100644
index 0000000..2edd6e7
--- /dev/null
+++ b/third_party/fastfeat/README.libvpx
@@ -0,0 +1,38 @@
+URL: https://github.com/edrosten/fast-C-src
+Version: 391d5e939eb1545d24c10533d7de424db8d9c191
+License: BSD
+License File: LICENSE
+
+Description:
+Library to compute FAST features with non-maximum suppression.
+
+The files are valid C and C++ code and have no special requirements for
+compiling; they do not depend on any libraries. Just compile them along with
+the rest of your project.
+
+To use the functions, #include "fast.h"
+
+The corner detectors have the following prototype (where X is 9, 10, 11 or 12):
+
+xy* fastX_detect_nonmax(const unsigned char * data, int xsize, int ysize, int stride, int threshold, int* numcorners)
+
+Where xy is the following simple struct typedef:
+
+typedef struct
+{
+ int x, y;
+} xy;
+
+The image is passed in as a block of data plus its dimensions, and the list of
+corners is returned as an array of xy structs together with an integer
+(numcorners) giving the number of corners found. The returned array can be
+deallocated with free(). Nonmaximal suppression is performed on the corners.
+Note that the stride is the number of bytes between rows; if your image has no
+padding, this is the same as xsize.
+
+The detection, scoring and nonmaximal suppression are available as individual
+functions. To see how to use the individual functions, see fast.c.
+
+Local Modifications:
+Add lines to turn off clang formatting for these files
+Remove Fast 10, 11 and 12
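For reference, a minimal sketch of calling the combined detector described in
the README above, using the fast9_detect_nonmax() prototype and xy struct from
fast.h. It assumes an 8-bit grayscale buffer img of width x height with no row
padding and an arbitrary threshold of 20; everything other than the fast9_*
functions and the xy struct is illustrative.

  #include <stdio.h>
  #include <stdlib.h>
  #include "fast.h"

  /* Detect FAST-9 corners with nonmax suppression on a packed grayscale image. */
  static void detect_corners(const unsigned char *img, int width, int height) {
    int num_corners = 0;
    /* stride == width because the buffer has no padding between rows. */
    xy *corners =
        fast9_detect_nonmax(img, width, height, width, 20, &num_corners);
    for (int i = 0; i < num_corners; ++i)
      printf("corner at (%d, %d)\n", corners[i].x, corners[i].y);
    /* The returned array is heap-allocated; release it with free(). */
    free(corners);
  }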
diff --git a/third_party/fastfeat/fast.c b/third_party/fastfeat/fast.c
new file mode 100644
index 0000000..0d7efc1
--- /dev/null
+++ b/third_party/fastfeat/fast.c
@@ -0,0 +1,22 @@
+// clang-format off
+#include <stdlib.h>
+#include "fast.h"
+
+
+xy* fast9_detect_nonmax(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners)
+{
+ xy* corners;
+ int num_corners;
+ int* scores;
+ xy* nonmax;
+
+ corners = fast9_detect(im, xsize, ysize, stride, b, &num_corners);
+ scores = fast9_score(im, stride, corners, num_corners, b);
+ nonmax = nonmax_suppression(corners, scores, num_corners, ret_num_corners);
+
+ free(corners);
+ free(scores);
+
+ return nonmax;
+}
+// clang-format on
diff --git a/third_party/fastfeat/fast.h b/third_party/fastfeat/fast.h
new file mode 100644
index 0000000..a00730e
--- /dev/null
+++ b/third_party/fastfeat/fast.h
@@ -0,0 +1,20 @@
+// clang-format off
+#ifndef FAST_H
+#define FAST_H
+
+typedef struct { int x, y; } xy;
+typedef unsigned char byte;
+
+int fast9_corner_score(const byte* p, const int pixel[], int bstart);
+
+xy* fast9_detect(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners);
+
+int* fast9_score(const byte* i, int stride, xy* corners, int num_corners, int b);
+
+xy* fast9_detect_nonmax(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners);
+
+xy* nonmax_suppression(const xy* corners, const int* scores, int num_corners, int* ret_num_nonmax);
+
+
+#endif
+// clang-format on
diff --git a/third_party/fastfeat/fast_9.c b/third_party/fastfeat/fast_9.c
new file mode 100644
index 0000000..36aee19
--- /dev/null
+++ b/third_party/fastfeat/fast_9.c
@@ -0,0 +1,5911 @@
+// clang-format off
+/*This is mechanically generated code*/
+#include <stdlib.h>
+
+typedef struct { int x, y; } xy;
+typedef unsigned char byte;
+
+int fast9_corner_score(const byte* p, const int pixel[], int bstart)
+{
+ int bmin = bstart;
+ int bmax = 255;
+ int b = (bmax + bmin)/2;
+
+ /*Compute the score using binary search*/
+ for(;;)
+ {
+ int cb = *p + b;
+ int c_b= *p - b;
+
+
+ if( p[pixel[0]] > cb)
+ if( p[pixel[1]] > cb)
+ if( p[pixel[2]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[7]] < c_b)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[14]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[6]] < c_b)
+ if( p[pixel[15]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[5]] < c_b)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[12]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[14]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[6]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[12]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[6]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[4]] < c_b)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[11]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[11]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[3]] < c_b)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[10]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[10]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[2]] < c_b)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[9]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[3]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[9]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[3]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[1]] < c_b)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[2]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[8]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[2]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[2]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[8]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[2]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[0]] < c_b)
+ if( p[pixel[1]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[2]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[2]] < c_b)
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[1]] < c_b)
+ if( p[pixel[2]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[3]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[2]] < c_b)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[6]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[14]] < c_b)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[6]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[15]] < c_b)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[6]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[9]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[3]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[8]] > cb)
+ if( p[pixel[7]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[10]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[2]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[2]] < c_b)
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[7]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[7]] > cb)
+ if( p[pixel[8]] > cb)
+ if( p[pixel[9]] > cb)
+ if( p[pixel[6]] > cb)
+ if( p[pixel[5]] > cb)
+ if( p[pixel[4]] > cb)
+ if( p[pixel[3]] > cb)
+ if( p[pixel[2]] > cb)
+ if( p[pixel[1]] > cb)
+ goto is_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] > cb)
+ if( p[pixel[11]] > cb)
+ if( p[pixel[12]] > cb)
+ if( p[pixel[13]] > cb)
+ if( p[pixel[14]] > cb)
+ if( p[pixel[15]] > cb)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else if( p[pixel[7]] < c_b)
+ if( p[pixel[8]] < c_b)
+ if( p[pixel[9]] < c_b)
+ if( p[pixel[6]] < c_b)
+ if( p[pixel[5]] < c_b)
+ if( p[pixel[4]] < c_b)
+ if( p[pixel[3]] < c_b)
+ if( p[pixel[2]] < c_b)
+ if( p[pixel[1]] < c_b)
+ goto is_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ if( p[pixel[10]] < c_b)
+ if( p[pixel[11]] < c_b)
+ if( p[pixel[12]] < c_b)
+ if( p[pixel[13]] < c_b)
+ if( p[pixel[14]] < c_b)
+ if( p[pixel[15]] < c_b)
+ goto is_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+ else
+ goto is_not_a_corner;
+
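+  /* The machine-generated decision tree above tests the ring of 16 pixels
+     around p against the candidate threshold b and jumps to one of the two
+     labels below, which tighten the binary-search bounds on the score. */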
+ is_a_corner:
+ bmin=b;
+ goto end_if;
+
+ is_not_a_corner:
+ bmax=b;
+ goto end_if;
+
+ end_if:
+
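+  /* Stop once the bracket [bmin, bmax] has collapsed; otherwise bisect and
+     re-run the decision tree with the new candidate threshold. */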
+ if(bmin == bmax - 1 || bmin == bmax)
+ return bmin;
+ b = (bmin + bmax) / 2;
+ }
+}
+
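+/* Fill pixel[0..15] with the offsets of the 16-pixel Bresenham circle of
+   radius 3 around a candidate pixel, for an image with the given row stride. */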
+static void make_offsets(int pixel[], int row_stride)
+{
+ pixel[0] = 0 + row_stride * 3;
+ pixel[1] = 1 + row_stride * 3;
+ pixel[2] = 2 + row_stride * 2;
+ pixel[3] = 3 + row_stride * 1;
+ pixel[4] = 3 + row_stride * 0;
+ pixel[5] = 3 + row_stride * -1;
+ pixel[6] = 2 + row_stride * -2;
+ pixel[7] = 1 + row_stride * -3;
+ pixel[8] = 0 + row_stride * -3;
+ pixel[9] = -1 + row_stride * -3;
+ pixel[10] = -2 + row_stride * -2;
+ pixel[11] = -3 + row_stride * -1;
+ pixel[12] = -3 + row_stride * 0;
+ pixel[13] = -3 + row_stride * 1;
+ pixel[14] = -2 + row_stride * 2;
+ pixel[15] = -1 + row_stride * 3;
+}
+
+
+
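+/* Compute a score for each detected corner: the largest threshold b for which
+   the point is still classified as a corner, as found by fast9_corner_score. */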
+int* fast9_score(const byte* i, int stride, xy* corners, int num_corners, int b)
+{
+ int* scores = (int*)malloc(sizeof(int)* num_corners);
+ int n;
+
+ int pixel[16];
+ make_offsets(pixel, stride);
+
+ for(n=0; n < num_corners; n++)
+ scores[n] = fast9_corner_score(i + corners[n].y*stride + corners[n].x, pixel, b);
+
+ return scores;
+}
+
+
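+/* Scan the image interior (a 3-pixel border is skipped so the full 16-pixel
+   ring fits) and classify every pixel with the FAST-9 segment test at
+   threshold b.  The nested if/else ladder below is machine-generated:
+   `continue` means "not a corner", and an empty block {} means the test
+   passed for this pixel. */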
+xy* fast9_detect(const byte* im, int xsize, int ysize, int stride, int b, int* ret_num_corners)
+{
+ int num_corners=0;
+ xy* ret_corners;
+ int rsize=512;
+ int pixel[16];
+ int x, y;
+
+ ret_corners = (xy*)malloc(sizeof(xy)*rsize);
+ make_offsets(pixel, stride);
+
+ for(y=3; y < ysize - 3; y++)
+ for(x=3; x < xsize - 3; x++)
+ {
+ const byte* p = im + y*stride + x;
+
+ int cb = *p + b;
+ int c_b= *p - b;
+ if(p[pixel[0]] > cb)
+ if(p[pixel[1]] > cb)
+ if(p[pixel[2]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else if(p[pixel[7]] < c_b)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else if(p[pixel[14]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[6]] < c_b)
+ if(p[pixel[15]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[5]] < c_b)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[12]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[14]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[6]] < c_b)
+ {}
+ else
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[12]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[6]] < c_b)
+ {}
+ else
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[4]] < c_b)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[11]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ {}
+ else
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[11]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ {}
+ else
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[3]] < c_b)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[10]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ {}
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[10]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ {}
+ else
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[2]] < c_b)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[9]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[3]] < c_b)
+ {}
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[9]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[3]] < c_b)
+ {}
+ else
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[1]] < c_b)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[2]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[8]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[2]] < c_b)
+ {}
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[2]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[8]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[2]] < c_b)
+ {}
+ else
+ if(p[pixel[11]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[0]] < c_b)
+ if(p[pixel[1]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[2]] > cb)
+ {}
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[2]] < c_b)
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[1]] < c_b)
+ if(p[pixel[2]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[3]] > cb)
+ {}
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[2]] < c_b)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ {}
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ {}
+ else
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[6]] > cb)
+ {}
+ else
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[14]] < c_b)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[6]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[15]] < c_b)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[13]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[6]] > cb)
+ {}
+ else
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ {}
+ else
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ {}
+ else
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[9]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[3]] > cb)
+ {}
+ else
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[8]] > cb)
+ if(p[pixel[7]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[10]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[2]] > cb)
+ {}
+ else
+ if(p[pixel[11]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[2]] < c_b)
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[7]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[7]] > cb)
+ if(p[pixel[8]] > cb)
+ if(p[pixel[9]] > cb)
+ if(p[pixel[6]] > cb)
+ if(p[pixel[5]] > cb)
+ if(p[pixel[4]] > cb)
+ if(p[pixel[3]] > cb)
+ if(p[pixel[2]] > cb)
+ if(p[pixel[1]] > cb)
+ {}
+ else
+ if(p[pixel[10]] > cb)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] > cb)
+ if(p[pixel[11]] > cb)
+ if(p[pixel[12]] > cb)
+ if(p[pixel[13]] > cb)
+ if(p[pixel[14]] > cb)
+ if(p[pixel[15]] > cb)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else if(p[pixel[7]] < c_b)
+ if(p[pixel[8]] < c_b)
+ if(p[pixel[9]] < c_b)
+ if(p[pixel[6]] < c_b)
+ if(p[pixel[5]] < c_b)
+ if(p[pixel[4]] < c_b)
+ if(p[pixel[3]] < c_b)
+ if(p[pixel[2]] < c_b)
+ if(p[pixel[1]] < c_b)
+ {}
+ else
+ if(p[pixel[10]] < c_b)
+ {}
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ if(p[pixel[10]] < c_b)
+ if(p[pixel[11]] < c_b)
+ if(p[pixel[12]] < c_b)
+ if(p[pixel[13]] < c_b)
+ if(p[pixel[14]] < c_b)
+ if(p[pixel[15]] < c_b)
+ {}
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ else
+ continue;
+ if(num_corners == rsize)
+ {
+ rsize*=2;
+ ret_corners = (xy*)realloc(ret_corners, sizeof(xy)*rsize);
+ }
+ ret_corners[num_corners].x = x;
+ ret_corners[num_corners].y = y;
+ num_corners++;
+
+ }
+
+ *ret_num_corners = num_corners;
+ return ret_corners;
+
+}
+
+// clang-format on
diff --git a/third_party/fastfeat/nonmax.c b/third_party/fastfeat/nonmax.c
new file mode 100644
index 0000000..e4f2911
--- /dev/null
+++ b/third_party/fastfeat/nonmax.c
@@ -0,0 +1,119 @@
+// clang-format off
+#include <stdlib.h>
+#include "fast.h"
+
+
+#define Compare(X, Y) ((X)>=(Y))
+
+xy* nonmax_suppression(const xy* corners, const int* scores, int num_corners, int* ret_num_nonmax)
+{
+ int num_nonmax=0;
+ int last_row;
+ int* row_start;
+ int i, j;
+ xy* ret_nonmax;
+ const int sz = (int)num_corners;
+
+ /* point_above points (roughly) to the pixel above the one of interest, if
+ there is a feature there. */
+ int point_above = 0;
+ int point_below = 0;
+
+
+ if(num_corners < 1)
+ {
+ *ret_num_nonmax = 0;
+ return 0;
+ }
+
+ ret_nonmax = (xy*)malloc(num_corners * sizeof(xy));
+
+ /* Find where each row begins
+ (the corners are output in raster scan order). A beginning of -1 signifies
+ that there are no corners on that row. */
+ last_row = corners[num_corners-1].y;
+ row_start = (int*)malloc((last_row+1)*sizeof(int));
+
+ for(i=0; i < last_row+1; i++)
+ row_start[i] = -1;
+
+ {
+ int prev_row = -1;
+ for(i=0; i< num_corners; i++)
+ if(corners[i].y != prev_row)
+ {
+ row_start[corners[i].y] = i;
+ prev_row = corners[i].y;
+ }
+ }
+
+
+
+ for(i=0; i < sz; i++)
+ {
+ int score = scores[i];
+ xy pos = corners[i];
+
+ /*Check left */
+ if(i > 0)
+ if(corners[i-1].x == pos.x-1 && corners[i-1].y == pos.y && Compare(scores[i-1], score))
+ continue;
+
+ /*Check right*/
+ if(i < (sz - 1))
+ if(corners[i+1].x == pos.x+1 && corners[i+1].y == pos.y && Compare(scores[i+1], score))
+ continue;
+
+ /*Check above (if there is a valid row above)*/
+ if(pos.y != 0 && row_start[pos.y - 1] != -1)
+ {
+ /*Make sure that current point_above is one
+ row above.*/
+ if(corners[point_above].y < pos.y - 1)
+ point_above = row_start[pos.y-1];
+
+ /*Make point_above point to the first of the pixels above the current point,
+ if it exists.*/
+ for(; corners[point_above].y < pos.y && corners[point_above].x < pos.x - 1; point_above++)
+ {}
+
+
+ for(j=point_above; corners[j].y < pos.y && corners[j].x <= pos.x + 1; j++)
+ {
+ int x = corners[j].x;
+ if( (x == pos.x - 1 || x == pos.x || x == pos.x + 1) && Compare(scores[j], score))
+ goto cont;
+ }
+
+ }
+
+ /*Check below (if there is anything below)*/
+ if(pos.y != last_row && row_start[pos.y + 1] != -1 && point_below < sz) /*Nothing below*/
+ {
+ if(corners[point_below].y < pos.y + 1)
+ point_below = row_start[pos.y+1];
+
+ /* Make point_below point to one of the pixels below the current point, if it
+ exists. */
+ for(; point_below < sz && corners[point_below].y == pos.y+1 && corners[point_below].x < pos.x - 1; point_below++)
+ {}
+
+ for(j=point_below; j < sz && corners[j].y == pos.y+1 && corners[j].x <= pos.x + 1; j++)
+ {
+ int x = corners[j].x;
+ if( (x == pos.x - 1 || x == pos.x || x == pos.x + 1) && Compare(scores[j], score))
+ goto cont;
+ }
+ }
+
+ ret_nonmax[num_nonmax++] = corners[i];
+ cont:
+ ;
+ }
+
+ free(row_start);
+ *ret_num_nonmax = num_nonmax;
+ return ret_nonmax;
+}
+
+// clang-format on
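For reviewers unfamiliar with the vendored fastfeat API, here is a minimal standalone sketch (not part of the patch itself) of how nonmax_suppression() might be exercised. It assumes fast.h declares the function with the signature shown above and defines xy as a struct with int x and y members, and that the input corners are listed in raster-scan order, which the row_start bookkeeping above relies on.

  #include <stdio.h>
  #include <stdlib.h>
  #include "fast.h"

  int main(void) {
    /* Three adjacent corners on one row, in raster-scan order; the middle
       corner has the highest score, so it should be the sole survivor. */
    xy corners[3] = { { 10, 5 }, { 11, 5 }, { 12, 5 } };
    int scores[3] = { 40, 90, 35 };
    int num_nonmax = 0;

    xy *kept = nonmax_suppression(corners, scores, 3, &num_nonmax);
    for (int i = 0; i < num_nonmax; i++)
      printf("kept corner (%d, %d)\n", kept[i].x, kept[i].y);

    free(kept);  /* the result array is malloc'd by nonmax_suppression */
    return 0;
  }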
diff --git a/webmenc.cc b/webmenc.cc
index f78f027..e3d209a 100644
--- a/webmenc.cc
+++ b/webmenc.cc
@@ -24,7 +24,6 @@
void write_webm_file_header(struct WebmOutputContext *webm_ctx,
const aom_codec_enc_cfg_t *cfg,
- const struct aom_rational *fps,
stereo_format_t stereo_fmt, unsigned int fourcc,
const struct AvxRational *par) {
mkvmuxer::MkvWriter *const writer = new mkvmuxer::MkvWriter(webm_ctx->stream);
diff --git a/webmenc.h b/webmenc.h
index 90211ff..74387fb 100644
--- a/webmenc.h
+++ b/webmenc.h
@@ -40,7 +40,6 @@
void write_webm_file_header(struct WebmOutputContext *webm_ctx,
const aom_codec_enc_cfg_t *cfg,
- const struct aom_rational *fps,
stereo_format_t stereo_fmt, unsigned int fourcc,
const struct AvxRational *par);